R/Import_Filter.R

Defines functions importFromGATK importFromTable importFromVCF

Documented in importFromGATK importFromTable

#' Imports SNP data from GATK VariablesToTable output
#'
#' Imports SNP data from the output of the
#' \href{https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_variantutils_VariantsToTable.php}{VariantsToTable}
#' function in GATK. After importing the data, the function then calculates
#' total reference allele frequency for both bulks together, the delta SNP index
#' (i.e. SNP index of the low bulk subtracted from the SNP index of the high
#' bulk), the G statistic and returns a data frame. The required GATK fields
#' (-F) are CHROM (Chromosome) and POS (Position). The required Genotype fields
#' (-GF) are AD (Allele Depth), DP (Depth). Recommended
#' fields are REF (Reference allele) and ALT (Alternative allele) Recommended
#' Genotype feilds are PL (Phred-scaled likelihoods) and GQ  (Genotype Quality).
#'
#' @param file The name of the GATK VariablesToTable output .table file which the
#'   data are to be read from.
#' @param highBulk The sample name of the High Bulk
#' @param lowBulk The sample name of the Low Bulk
#' @param chromList a string vector of the chromosomes to be used in the
#'   analysis. Useful for filtering out unwanted contigs etc.
#' @return Returns a data frame containing columns for Read depth (DP),
#'   Reference Allele Depth (AD_REF) and Alternative Allele Depth (AD_ALT),
#'   Genoytype Quality (GQ) and SNPindex for each bulk (indicated by .HIGH and
#'   .LOW column name suffix). Total reference allele frequnce "REF_FRQ" is the
#'   sum of AD.REF for both bulks divided by total Depth for that SNP. The
#'   deltaSNPindex is equal to  SNPindex.HIGH - SNPindex.LOW. The GStat column
#'   is the calculated G statistic for that SNP.
#' @seealso \code{\link{getG}} for explaination of how G statistic is
#'   calculated.
#'   \href{http://gatkforums.broadinstitute.org/gatk/discussion/1268/what-is-a-vcf-and-how-should-i-interpret-it}{What
#'   is a VCF and how should I interpret it?} for more information on GATK
#'   Fields and Genotype Fields
#' @examples df <-  ImportFromGATK(filename = file.table,
#'     highBulk = highBulkSampleName,
#'     lowBulk = lowBulkSampleName,
#'     chromList = c("Chr1","Chr4","Chr7"))
#' @export importFromGATK

importFromGATK <- function(file,
    highBulk = character(),
    lowBulk = character(),
    chromList = NULL) {
    
    # first read one line to help define col types
    colheader <- read.delim(file, nrows=1, check.names=FALSE)
    
    # identify the sample name specific int and chr columns
    int_matches <- grep('DP|GQ', names(colheader), value=TRUE)
    chr_matches <- grep('\\.AD', names(colheader), value=TRUE)
    
    # create cols_spec class definitions
    int_cols <- do.call(readr::cols, setNames(
        rep(list(readr::col_integer()), length(int_matches)), 
        int_matches))$cols
    
    chr_cols <- do.call(readr::cols, setNames(
        rep(list(readr::col_character()), length(chr_matches)), 
        chr_matches))$cols
    
    col_defs <- readr::cols(CHROM = "c", POS = "i")
    col_defs$cols <- c(readr::cols(CHROM = "c", POS = "i")$cols,
                       int_cols,
                       chr_cols
    )
    
    SNPset <- readr::read_tsv(file = file, 
                              col_names = TRUE, 
                              guess_max = 10000,
                              col_types = col_defs)
    
    if (!all(
        c(
        "CHROM", 
        "POS", 
        paste0(highBulk, ".AD"), 
        paste0(lowBulk, ".AD"), 
        paste0(highBulk, ".DP"), 
        paste0(lowBulk, ".DP")
        ) %in% names(SNPset))) {
        stop("One of the required fields is missing. Check your table file.")
    }
    
# rename columns based on bulk names and flip headers (ie HIGH.AD -> AD.HIGH to match the rest of the functions
    colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")] <- 
        gsub(pattern = highBulk, 
             replacement = "HIGH", 
             x = colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")])
    colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")] <- 
        gsub(pattern = lowBulk, 
             replacement = "LOW", 
             x = colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")])
    
    colnames(SNPset) <-
        sapply(strsplit(colnames(SNPset), "[.]"),
            function(x) {paste0(rev(x),collapse = '.')})
    
    #Keep only wanted chromosomes
    if (!is.null(chromList)) {
        message("Removing the following chromosomes: ", paste(unique(SNPset$CHROM)[!unique(SNPset$CHROM) %in% chromList], collapse = ", "))
        SNPset <- SNPset[SNPset$CHROM %in% chromList, ]
    }
    #arrange the chromosomes by natural order sort, eg Chr1, Chr10, Chr2 >>> Chr1, Chr2, Chr10
    SNPset$CHROM <-
        factor(SNPset$CHROM, levels = gtools::mixedsort(unique(SNPset$CHROM)))
    
    SNPset <- SNPset %>%
        tidyr::separate(
            col = AD.LOW,
            into = "AD_REF.LOW",
            sep = ",",
            extra = "drop",
            convert = TRUE
        ) %>%
        tidyr::separate(
            col = AD.HIGH,
            into = "AD_REF.HIGH",
            sep = ",",
            extra = "drop",
            convert = TRUE
        ) %>%
        dplyr::mutate(
            AD_ALT.HIGH = DP.HIGH - AD_REF.HIGH,
            AD_ALT.LOW = DP.LOW - AD_REF.LOW,
            SNPindex.HIGH = AD_ALT.HIGH / DP.HIGH,
            SNPindex.LOW = AD_ALT.LOW / DP.LOW,
            REF_FRQ = (AD_REF.HIGH + AD_REF.LOW) / (DP.HIGH + DP.LOW),
            deltaSNP = SNPindex.HIGH - SNPindex.LOW
        ) %>%
        dplyr::select(
            -dplyr::contains("HIGH"),
            -dplyr::contains("LOW"),
            -dplyr::one_of("deltaSNP", "REF_FRQ"),
            dplyr::matches("AD.*.LOW"),
            dplyr::contains("LOW"),
            dplyr::matches("AD.*.HIGH"),
            dplyr::contains("HIGH"),
            dplyr::everything()
        )
    
    return(as.data.frame(SNPset))
}


#' Import SNP data from a delimited file
#'
#' After importing the data from a delimited file, the function then calculates
#' total reference allele frequency for both bulks together, the delta SNP index
#' (i.e. SNP index of the low bulk subtracted from the SNP index of the high
#' bulk), the G statistic and returns a data frame. The required columns in the
#' file are CHROM (Chromosome) and POS (Position) as well as the reference and
#' alternate allele depths (number of reads supporting each allele). The allele
#' depths should be in columns named in this format:
#' \code{AD_(<ALT/REF>).<sampleName>}. For example, the column for alternate
#' allele depth for a high bulk sample named "sample1", should be
#' "AD_ALT.sample1". Any other columns describing the SNPs are allowed, ie the
#' actual allele calls, or a quality score. If the column is Bulk specific, It
#' should be named \code{columnName.sampleName}, i.e "QUAL.sample1".
#'
#' @param file The name of the file which the data are to be read from.
#' @param highBulk The sample name of the High Bulk. Defaults to "HIGH"
#' @param lowBulk The sample name of the Low Bulk. Defaults to "LOW"
#' @param chromList a string vector of the chromosomes to be used in the
#'   analysis. Useful for filtering out unwanted contigs etc.
#' @param sep the field separator character. Values on each line of the file are
#'   separated by this character. Default is for csv file ie ",".
#' @return Returns a data frame containing columns for per bulk total Read depth (DP),
#'   Reference Allele Depth (AD_REF) and Alternative Allele Depth (AD_ALT), any
#'   other SNP associated columns in the file, and SNPindex for each bulk
#'   (indicated by .HIGH and .LOW column name suffix). Total reference allele
#'   frequnce "REF_FRQ" is the sum of AD_REF for both bulks divided by total
#'   Depth for that SNP. The deltaSNPindex is equal to  SNPindex.HIGH -
#'   SNPindex.LOW.
#'
#' @export importFromTable

importFromTable <-
    function(file,
             highBulk = "HIGH",
             lowBulk = "LOW",
             chromList = NULL,
             sep = ",") {
        SNPset <-
            readr::read_delim(
                file = file,
                delim = sep,
                col_names = TRUE,
                col_types = readr::cols(
                    .default = readr::col_guess(),
                    CHROM = "c",
                    POS = "i"
                )
            )
        # check CHROM
        if (!"CHROM" %in% names(SNPset)) {
            stop("No 'CHROM' coloumn found.")
        }
        
        # check POS
        if (!"POS" %in% names(SNPset)) {
            stop("No 'POS' coloumn found.")
        }
        
        # check AD_REF.HIGH
        if (!paste0("AD_REF.", highBulk) %in% names(SNPset)) {
            stop(
                "No High Bulk AD_REF coloumn found. Column should be named 'AD_REF.highBulkName'."
            )
        }
        
        # check AD_REF.LOW
        if (!paste0("AD_REF.", lowBulk) %in% names(SNPset)) {
            stop("No Low Bulk AD_REF coloumn found. Column should be named 'AD_REF.lowBulkName'.")
        }
        
        #check AD_ALT.HIGH
        if (!paste0("AD_ALT.", highBulk) %in% names(SNPset)) {
            stop(
                "No High Bulk AD_REF coloumn found. Column should be named 'AD_ALT.highBulkName'."
            )
        }
        
        # check AD_ALT.LOW
        if (!paste0("AD_ALT.", lowBulk) %in% names(SNPset)) {
            stop("No Low Bulk AD_ALT coloumn found. Column should be named 'AD_ALT.lowBulkName'.")
        }
        
        # Keep only wanted chromosomes
        if (!is.null(chromList)) {
            message("Removing the following chromosomes: ",
                    paste(unique(SNPset$CHROM)[!unique(SNPset$CHROM) %in% chromList], collapse = ", "))
            SNPset <- SNPset[SNPset$CHROM %in% chromList, ]
        }
        # arrange the chromosomes by natural order sort, eg Chr1, Chr10, Chr2 >>> Chr1, Chr2, Chr10
        SNPset$CHROM <-
            factor(SNPset$CHROM, levels = gtools::mixedsort(unique(SNPset$CHROM)))
        
        # Rename columns
        message("Renaming the following columns: ",
                paste(colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")][grep(highBulk, x = colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")])], collapse = ", "))
        colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")] <-
            gsub(pattern = highBulk,
                 replacement = "HIGH",
                 x = colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")])
        
        message("Renaming the following columns: ",
                paste(colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")][grep(lowBulk, x = colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")])], collapse = ", "))
        colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")] <-
            gsub(pattern = lowBulk,
                 replacement = "LOW",
                 x = colnames(SNPset)[!colnames(SNPset) %in% c("CHROM", "POS", "REF", "ALT")])
        
        # calculate DPs
        SNPset <- SNPset %>%
            dplyr::mutate(
                DP.HIGH = AD_REF.HIGH + AD_ALT.HIGH,
                DP.LOW = AD_REF.LOW + AD_ALT.LOW,
                SNPindex.HIGH = AD_ALT.HIGH / DP.HIGH,
                SNPindex.LOW = AD_ALT.LOW / DP.LOW,
                REF_FRQ = (AD_REF.HIGH + AD_REF.LOW) / (DP.HIGH + DP.LOW),
                deltaSNP = SNPindex.HIGH - SNPindex.LOW
            ) %>%
            dplyr::select(
                -dplyr::contains("HIGH"),-dplyr::contains("LOW"),-dplyr::one_of("deltaSNP", "REF_FRQ"),
                dplyr::matches("AD.*.LOW"),
                dplyr::contains("LOW"),
                dplyr::matches("AD.*.HIGH"),
                dplyr::contains("HIGH"),
                dplyr::everything()
            )
        
        return(as.data.frame(SNPset))
    }


## not exported still only works for GATK...
importFromVCF <- function(file,
                          highBulk = character(),
                          lowBulk = character(),
                          chromList = NULL) {
    
    vcf <- vcfR::read.vcfR(file = file)
    message("Keeping SNPs that pass all filters")
    vcf <- vcf[vcf@fix[, "FILTER"] == "PASS"] 
    
    fix <- dplyr::as_tibble(vcf@fix[, c("CHROM", "POS", "REF", "ALT")]) %>% mutate(Key = seq(1:nrow(.)))
    
    # if (!all(
    #     c(
    #         "CHROM", 
    #         "POS", 
    #         paste0(highBulk, ".AD"), 
    #         paste0(lowBulk, ".AD"), 
    #         paste0(highBulk, ".DP"), 
    #         paste0(lowBulk, ".DP")
    #     ) %in% names(SNPset))) {
    #     stop("One of the required fields is missing. Check your VCF file.")
    # }
    
    tidy_gt <- extract_gt_tidy(vcf, format_fields = c("AD", "DP", "GQ"), gt_column_prepend = "", alleles = FALSE)
    
    SNPset <- tidy_gt %>%
        filter(Indiv == LowBulk) %>% select(-Indiv) %>%
        dplyr::left_join(select(filter(tidy_gt, Indiv == HighBulk),-Indiv),
                         by = "Key",
                         suffix = c(".LOW", ".HIGH")) %>%
        tidyr::separate(
            col = "AD.LOW",
            into = c("AD_REF.LOW", "AD_ALT.LOW"),
            sep = ",",
            extra = "merge",
            convert = TRUE
        ) %>%
        tidyr::separate(
            col = "AD.HIGH",
            into = c("AD_REF.HIGH", "AD_ALT.HIGH"),
            sep = ",",
            extra = "merge", 
            convert = TRUE
        ) %>%
        dplyr::full_join(x = fix, by = "Key") %>%
        dplyr::mutate(
            AD_ALT.HIGH = DP.HIGH - AD_REF.HIGH,
            AD_ALT.LOW = DP.LOW - AD_REF.LOW,
            SNPindex.HIGH = AD_ALT.HIGH / DP.HIGH,
            SNPindex.LOW = AD_ALT.LOW / DP.LOW,
            REF_FRQ = (AD_REF.HIGH + AD_REF.LOW) / (DP.HIGH + DP.LOW),
            deltaSNP = SNPindex.HIGH - SNPindex.LOW
        ) %>%
        select(-Key)
    #Keep only wanted chromosomes
    if (!is.null(chromList)) {
        message("Removing the following chromosomes: ", paste(unique(SNPset$CHROM)[!unique(SNPset$CHROM) %in% chromList], collapse = ", "))
        SNPset <- SNPset[SNPset$CHROM %in% chromList, ]
    }
    as.data.frame(SNPset)
}


#' Filter SNPs based on read depth and quality
#'
#' Use filtering paramaters to filter out high and low depth reads as well as
#' low Genotype Quality as defined by GATK. All filters are optional but recommended.
#'
#' @param SNPset The data frame imported by \code{ImportFromGATK}
#' @param refAlleleFreq A numeric < 1. This will filter out SNPs with a
#'   Reference Allele Frequency less than \code{refAlleleFreq} and greater than
#'   1 - \code{refAlleleFreq}. Eg. \code{refAlleleFreq = 0.3} will keep SNPs
#'   with 0.3 <= REF_FRQ <= 0.7
#' @param filterAroundMedianDepth Filters total SNP read depth for both bulks. A
#'   median and median absolute deviation (MAD) of depth will be calculated.
#'   SNPs with read depth greater or less than \code{filterAroundMedianDepth}
#'   MADs away from the median will be filtered.
#' @param minTotalDepth The minimum total read depth for a SNP (counting both
#'   bulks)
#' @param maxTotalDepth The maximum total read depth for a SNP (counting both
#'   bulks)
#' @param minSampleDepth The minimum read depth for a SNP in each bulk
#' @param depthDifference The maximum absolute difference in read depth between the bulks.
#' @param minGQ The minimum Genotype Quality as set by GATK. This is a measure
#'   of how confident GATK was with the assigned genotype (i.e. homozygous ref,
#'   heterozygous, homozygous alt). See
#'   \href{http://gatkforums.broadinstitute.org/gatk/discussion/1268/what-is-a-vcf-and-how-should-i-interpret-it}{What
#'   is a VCF and how should I interpret it?}
#' @param verbose logical. If \code{TRUE} will report number of SNPs filtered in
#'   each step.
#' @return Returns a subset of the data frame supplied which meets the filtering
#'   conditions applied by the selected parameters. If \code{verbose} is
#'   \code{TRUE} the function reports the number of SNPs filtered in each step
#'   as well as the initiatl number of SNPs, the total number of SNPs filtered
#'   and the remaining number.
#'
#' @seealso See \code{\link[stats]{mad}} for explaination of calculation of
#'   median absolute deviation.
#'   \href{http://gatkforums.broadinstitute.org/gatk/discussion/1268/what-is-a-vcf-and-how-should-i-interpret-it}{What
#'   is a VCF and how should I interpret it?} for more information on GATK
#'   Fields and Genotype Fields
#' @examples df_filt <- FilterSNPs(
#'     df,
#'     refAlleleFreq = 0.3,
#'     minTotalDepth = 40,
#'     maxTotalDepth = 80,
#'     minSampleDepth = 20,
#'     minGQ = 99,
#'     verbose = TRUE
#' )
#'
#' @export filterSNPs

filterSNPs <- function(SNPset,
    refAlleleFreq,
    filterAroundMedianDepth,
    minTotalDepth,
    maxTotalDepth,
    minSampleDepth,
    depthDifference,
    minGQ,
    verbose = TRUE) {
    
    org_count <- nrow(SNPset)
    count <- nrow(SNPset)
    
    # Filter by total reference allele frequency
    if (!missing(refAlleleFreq)) {
        if (verbose) {
            message(
                "Filtering by reference allele frequency: ",
                refAlleleFreq,
                " <= REF_FRQ <= ",
                1 - refAlleleFreq
            )
        }
        SNPset <- dplyr::filter(SNPset, SNPset$REF_FRQ < 1 - refAlleleFreq &
                SNPset$REF_FRQ > refAlleleFreq)
            if (verbose) {
            message("...Filtered ", count - nrow(SNPset), " SNPs")
        }
        count <- nrow(SNPset)
    }
    
    #Total read depth filtering
    
    if (!missing(filterAroundMedianDepth)) {
        # filter by Read depth for each SNP FilterByMAD MADs around the median
        madDP <-
            mad(
                x = (SNPset$DP.HIGH + SNPset$DP.LOW),
                constant = 1,
                na.rm = TRUE
            )
        medianDP <-
            median(x = (SNPset$DP.HIGH + SNPset$DP.LOW),
                na.rm = TRUE)
        maxDP <- medianDP + filterAroundMedianDepth * madDP
        minDP <- medianDP - filterAroundMedianDepth * madDP
        SNPset <- dplyr::filter(SNPset, (DP.HIGH + DP.LOW) <= maxDP &
                (DP.HIGH + DP.LOW) >= minDP)

        if(verbose) {message("Filtering by total read depth: ",
            filterAroundMedianDepth,
            " MADs arround the median: ", minDP, " <= Total DP <= ", maxDP)
            message("...Filtered ", count - nrow(SNPset), " SNPs")}
        count <- nrow(SNPset)
        
    }
    
    if (!missing(minTotalDepth)) {
        # Filter by minimum total SNP depth
        if (verbose) {
            message("Filtering by total sample read depth: Total DP >= ",
                minTotalDepth)
        }
        SNPset <-
            dplyr::filter(SNPset, (DP.HIGH + DP.LOW) >= minTotalDepth)

        if (verbose) {
            message("...Filtered ", count - nrow(SNPset), " SNPs")
        }
        count <- nrow(SNPset)
    }
    
    if (!missing(maxTotalDepth)) {
        # Filter by maximum total SNP depth
        if (verbose) {
            message("Filtering by total sample read depth: Total DP <= ",
                maxTotalDepth)
        }
        SNPset <-
            dplyr::filter(SNPset, (DP.HIGH + DP.LOW) <= maxTotalDepth)
        if (verbose) {
            message("...Filtered ", count - nrow(SNPset), " SNPs")
        }
        count <- nrow(SNPset)
    }
    
    
    # Filter by min read depth in either sample
    if (!missing(minSampleDepth)) {
        if (verbose) {
            message("Filtering by per sample read depth: DP >= ",
                minSampleDepth)
        }
        SNPset <-
            dplyr::filter(SNPset, DP.HIGH >= minSampleDepth &
                    SNPset$DP.LOW >= minSampleDepth)
        if (verbose) {
            message("...Filtered ", count - nrow(SNPset), " SNPs")
        }
        count <- nrow(SNPset)
    }
    
    # Filter by Genotype Quality
    if (!missing(minGQ)) {
        if (all(c("GQ.LOW", "GQ.HIGH") %in% names(SNPset))) {
        if (verbose) {
            message("Filtering by Genotype Quality: GQ >= ", minGQ)
        }
        SNPset <-
            dplyr::filter(SNPset, GQ.LOW >= minGQ & GQ.HIGH >= minGQ)
        if (verbose) {
            message("...Filtered ", count - nrow(SNPset), " SNPs")
        }
        count <- nrow(SNPset)} 
        else {
            message("GQ columns not found. Skipping...")
        }
    }
    
    # Filter by difference between high and low bulks
    if (!missing(depthDifference)) {
        if (verbose) {
            message("Filtering by difference between bulks <= ",
                    depthDifference)
        }
        SNPset <-
            dplyr::filter(SNPset, abs(DP.HIGH - DP.LOW) <= depthDifference)
        if (verbose) {
            message("...Filtered ", count - nrow(SNPset), " SNPs")
        }
        count <- nrow(SNPset)
    }
    
    # #Filter SNP Clusters
    # if (!is.null(SNPsInCluster) & !is.null(ClusterWin)) {
    #     tmp <- which(diff(SNPset$POS, SNPsInCluster-1) < ClusterWin)
    # message("...Filtered ", count - nrow(SNPset), " SNPs")
    # count <- nrow(SNPset)
    # }
    if (verbose) {
        message(
            "Original SNP number: ",
            org_count,
            ", Filtered: ",
            org_count - count,
            ", Remaining: ",
            count
        )
    }
    return(as.data.frame(SNPset))
}
bmansfeld/QTLseqR documentation built on Jan. 25, 2020, 8:52 p.m.