setMethod("autosomalRecessiveHomozygous", signature(param="VariantFilteringParam"),
function(param, svparam=ScanVcfParam(),
use=c("everything", "complete.obs", "all.obs"),
includeHomRef=FALSE,
age.of.onset, phenocopies,
BPPARAM=bpparam("SerialParam")) {
use <- match.arg(use)
## store call for reproducing it later
callobj <- match.call()
callstr <- gsub(".local", "autosomalRecessiveHomozygous", deparse(callobj))
## fetch necessary parameters
vcfFiles <- param$vcfFiles
ped <- param$pedFilename
seqInfos <- param$seqInfos
txdb <- get(param$txdb)
bsgenome <- get(param$bsgenome)
sampleNames <- param$sampleNames
if (!exists(as.character(substitute(BPPARAM))[1]))
stop(sprintf("Parallel back-end function %s given in argument 'BPPARAM' does not exist in the current workspace. Either you did not write correctly the function name or you did not load the package 'BiocParallel'.", as.character(substitute(BPPARAM))))
if (length(vcfFiles) > 1)
stop("More than one input VCF file is currently not supported. Please either merge the VCF files into a single one with software such as vcftools or GATK, or do the variant calling simultaneously on all samples, or proceed analyzing each file separately.")
else if (length(vcfFiles) < 1)
stop("A minimum of 1 VCF file has to be provided.")
if (is.na(ped))
stop("Please specify a PED file name when building the parameter object.")
pedDf <- .readPEDfile(ped)
#unaff <- pedDf[pedDf$Phenotype == 1, ]
#aff <- pedDf[pedDf$Phenotype == 2, ]
annotationCache <- new.env() ## cache annotations when using VariantAnnotation::locateVariants()
annotated_variants <- VRanges()
metadata(mcols(annotated_variants)) <- list(cutoffs=CutoffsList(), sortings=CutoffsList())
open(vcfFiles[[1]])
n.var <- 0
flag <- TRUE
while (flag && nrow(vcf <- readVcf(vcfFiles[[1]], genome=seqInfos[[1]], param=svparam))) {
## insert an index for each variant in the VCF file
info(header(vcf)) <- rbind(info(header(vcf)),
DataFrame(Number=1, Type="Integer",
Description="Variant index in the VCF file.",
row.names="VCFIDX"))
info(vcf)$VCFIDX <- (n.var+1):(n.var+nrow(vcf))
varIDs <- rownames(vcf)
n.var <- n.var + nrow(vcf)
mask <- .autosomalRecessiveHomozygousMask(vcf, pedDf, bsgenome, use,
includeHomRef, phenocopies,
age.of.onset)
if (any(mask)) {
## filter out variants that do not segregate as an autosomal recessive homozygous trait
vcf <- vcf[mask, ]
## coerce the VCF object to a VRanges object
variants <- as(vcf, "VRanges")
## since the conversion of VCF to VRanges strips the VCF ID field, let's put it back
variants$VARID <- varIDs[variants$VCFIDX]
## harmonize Seqinfo data between variants, annotations and reference genome
variants <- .matchSeqinfo(variants, txdb, bsgenome)
## annotate variants
if (length(annotated_variants) > 0)
annotated_variants <- c(annotated_variants, annotationEngine(variants, param, annotationCache,
BPPARAM=BPPARAM))
else
annotated_variants <- annotationEngine(variants, param, annotationCache,
BPPARAM=BPPARAM)
}
if (length(vcfWhich(svparam)) > 0) ## temporary fix to keep this looping
flag <- FALSE ## structure with access through genomic ranges
message(sprintf("%d variants processed", n.var))
}
close(vcfFiles[[1]])
gSO <- annotateSO(annotated_variants, sog(param))
annotated_variants <- addSOmetadata(annotated_variants)
if (length(annotated_variants) == 0)
warning("No variants segregate following an autosomal recessive homozygous inheritance model.")
annoGroups <- list()
if (!is.null(mcols(mcols(annotated_variants))$TAB)) {
mask <- !is.na(mcols(mcols(annotated_variants))$TAB)
annoGroups <- split(colnames(mcols(annotated_variants))[mask],
mcols(mcols(annotated_variants))$TAB[mask])
}
## add functional annotation filters generated by the annotation engine
funFilters <- FilterRules(lapply(metadata(mcols(annotated_variants))$filters,
function(f) { environment(f) <- baseenv() ; f}))
active(funFilters) <- FALSE ## by default, functional annotation filters are inactive
flt <- c(filters(param), funFilters)
fltMd <- rbind(filtersMetadata(param),
DataFrame(Description=sapply(metadata(mcols(annotated_variants))$filters,
attr, "description"),
AnnoGroup=sapply(metadata(mcols(annotated_variants))$filters,
attr, "TAB")))
cutoffs <- metadata(mcols(annotated_variants))$cutoffs
sortings <- metadata(mcols(annotated_variants))$sortings
bsgenome <- get(param$bsgenome)
new("VariantFilteringResults", callObj=callobj, callStr=callstr,
genomeDescription=as(bsgenome, "GenomeDescription"), inputParameters=param,
activeSamples=sampleNames, inheritanceModel="autosomal recessive homozygous",
variants=annotated_variants, bamViews=BamViews(), gSO=gSO, filters=flt,
filtersMetadata=fltMd, cutoffs=cutoffs, sortings=sortings, annoGroups=annoGroups,
minScore5ss=NA_real_, minScore3ss=NA_real_)
})
## build a logical mask whose truth values correspond to variants that segregate
## according to an autosomal recessive homozygous inheritance model
.autosomalRecessiveHomozygousMask <- function(vObj, pedDf, bsgenome,
use=c("everything", "complete.obs", "all.obs"),
includeHomRef=FALSE,
phenocopies, ## the phenocopies argument is experimental
age.of.onset) { ## the age.of.onset argument is experimental
use <- match.arg(use)
if (class(vObj) != "VRanges" && class(vObj) != "CollapsedVCF")
stop("Argument 'vObj' should be either a 'VRanges' or a 'CollapsedVCF' object.")
#stopifnot(all(colnames(pedDf) %in% c("FamilyID", "IndividualID", "FatherID", "MotherID", "Sex", "Phenotype"))) ## QC
nsamples <- nvariants <- 0
if (class(vObj) == "VRanges") {
nsamples <- nlevels(sampleNames(vObj))
nvariants <- length(vObj)
} else if (class(vObj) == "CollapsedVCF") {
nsamples <- as.integer(ncol(vObj))
nvariants <- nrow(vObj)
}
## assuming Phenotype == 2 means affected and Phenotype == 1 means unaffected
if (sum(pedDf$Phenotype == 2) < 1)
stop("No affected individuals detected. Something is wrong with the PED file.")
if (missing(age.of.onset)){
unaff <- pedDf[pedDf$Phenotype == 1, ]
aff <- pedDf[pedDf$Phenotype == 2, ]
} else {
if (is.null(pedDf$Age))
stop("If the 'age.of.onset' argument is given, the PED file should have an 'Age' column.")
## If age.of.onset is specified
## Healthy individuals below the age of disease onset show an uncertain phenotype which is rewritten as unknown.
pedDf[which(pedDf$Age<age.of.onset & pedDf$Phenotype==1),]$Phenotype <- 0
unaff <- pedDf[pedDf$Phenotype == 1, ]
aff <- pedDf[pedDf$Phenotype == 2, ]
}
## restrict upfront variants to those in autosomal chromosomes
## we subset to the first element of the value returned by seqlevelsStyle()
## to deal with cases in which only a subset of chromosomes is contained in
## the input VCF (typically for teaching/example/illustration purposes) which
## matches more than one chromosome style, or because Ensembl is identical to NCBI for human :\
snames <- as.character(seqnames(vObj))
autosomalMask <- snames %in% extractSeqlevelsByGroup(organism(bsgenome),
seqlevelsStyle(vObj)[1],
group="auto")
## build logical mask for variants that segregate as an autosomal recessive homozygous trait
arhomMask <- vector(mode="logical", length=nvariants) ## assume default values are FALSE
if (!any(autosomalMask))
return(arhomMask)
## fetch genotypes
gt <- NULL
if (class(vObj) == "VRanges")
gt <- do.call("cbind", split(vObj$GT[autosomalMask], sampleNames(vObj)))
else if (class(vObj) == "CollapsedVCF")
gt <- geno(vObj)$GT[autosomalMask, , drop=FALSE]
## further restrict affected and unaffected individuals to
## those who have been genotyped
gtind <- colnames(gt)
unaff <- unaff[unaff$IndividualID %in% gtind, , drop=FALSE]
aff <- aff[aff$IndividualID %in% gtind, , drop=FALSE]
if (nrow(aff) == 0)
stop("No affected individuals have genotypes.")
phasedgt <- any(grepl("\\|", gt))
## the phenocopies argument is experimental
naff <- as.integer(nrow(aff))
if (missing(phenocopies) || is.null(phenocopies))
phenocopies <- 0L
if (class(phenocopies) == "numeric" && (phenocopies < 0 || phenocopies > 0.5))
stop("When phenocopies is a real number, then it should take values > 0 and < 0.5.")
else if (class(phenocopies) == "integer" && (phenocopies < 0 || phenocopies > naff/2))
stop(sprintf("When phenocopies is an integer (%d), then it should take values >= 0 and <= %d (# of genotyped samples from half of the affected individuals).",
phenocopies, naff/2))
if (class(phenocopies) == "numeric")
phenocopies <- ceiling(phenocopies*naff)
missingMask <- rowSums(gt == ".") > 0
if (phasedgt)
missingMask <- missingMask | (rowSums(gt == ".|.") > 0)
else
missingMask <- missingMask | (rowSums(gt == "./.") > 0)
## missingMask <- apply(gt, 1, function(x) any(x == "." | x == "./." | x == ".|."))
if (any(missingMask) && use == "all.obs")
stop("There are missing genotypes and current policy to deal with them is 'all.obs', which does not allow them.")
## build logical masks of carriers (unaffected) and affected individuals
## variants in carriers should be heterozygous and affected should be homozygous alternative
carriersMask <- rep(TRUE, times=nrow(gt))
if (nrow(unaff) > 0) {
unaffgt <- gt[, unaff$IndividualID, drop=FALSE]
if (any(missingMask) && use == "everything") {
unaffgt[unaffgt == "."] <- NA_character_
if (phasedgt)
unaffgt[unaffgt == ".|."] <- NA_character_
else
unaffgt[unaffgt == "./."] <- NA_character_
}
## old mask assuming biallelic variants
## carriersMask <- unaffgt == "0/1" | unaffgt == "0|1" | unaffgt == "1|0"
## carriers mask is built taking into account multiallelic variants that typically happen in indels,
## i.e., 0/1, 0/2, 0/3, etc.
if (phasedgt)
carriersMask <- matrix(grepl("0\\|\\|0", unaffgt), nrow=nrow(unaffgt)) & unaffgt != "0|0"
else
carriersMask <- matrix(grepl("0/", unaffgt), nrow=nrow(unaffgt)) & unaffgt != "0/0"
carriersMask <- rowSums(carriersMask, na.rm=TRUE) == nrow(unaff)
rm(unaffgt)
}
affgt <- gt[, aff$IndividualID, drop=FALSE]
if (any(missingMask)) { ## && use == "everything") {
affgt[affgt == "."] <- NA_character_
if (phasedgt)
affgt[affgt == ".|."] <- NA_character_
else
affgt[affgt == "./."] <- NA_character_
}
## old mask assuming biallelic variants
## affectedMask <- affgt == "1/1" | affgt == "1|1"
## affected mask is built taking into account multiallelic variants that typically happen in indels,
## i.e., 1/1, 2/2, 3/3, etc.
if (phasedgt) {
if (!includeHomRef)
affgt[affgt == "0|0"] <- NA_character_
affgt <- strsplit(affgt, "|")
} else {
if (!includeHomRef)
affgt[affgt == "0/0"] <- NA_character_
affgt <- strsplit(affgt, "/")
}
naMask <- sapply(affgt, function(x) any(is.na(x)))
affectedMask <- vector(mode="logical", length=length(naMask))
affectedMask[!naMask] <- as.vector(tapply(unlist(affgt[!naMask]), rep(1:sum(!naMask), each=2),
function(x) all(x[1] == x[2])))
affectedMask <- matrix(affectedMask, ncol=nrow(aff))
if (phenocopies > 0L)
affectedMask <- rowSums(affectedMask, na.rm=TRUE) >= nrow(aff)-phenocopies
else
affectedMask <- rowSums(affectedMask, na.rm=TRUE) == nrow(aff)
rm(affgt)
caMask <- carriersMask & affectedMask
if (any(missingMask) && use == "complete.obs")
caMask <- caMask & !missingMask
## variants ultimately set to NA are discarded (should this be tuned by an argument?)
caMask[is.na(caMask)] <- FALSE
if (class(vObj) == "VRanges") {
nauto <- sum(autosomalMask)
idx <- split(1:nauto, sampleNames(vObj[autosomalMask]))
mask <- vector(mode="logical", length=nauto)
mask[unlist(idx, use.names=FALSE)] <- rep(caMask, times=nsamples)
arhomMask[autosomalMask] <- mask
} else if (class(vObj) == "CollapsedVCF")
arhomMask[autosomalMask] <- caMask
else
warning(paste(sprintf("object 'vObj' has class %s, unknown to this function.",
class(vObj)),
"As a consequence, no variants are selected as autosomal recessive homozygous."))
arhomMask
}
.autosomalRecessiveHomozygousFilter <- function(x) {
pedFilename <- param(x)$pedFilename
if (is.null(pedFilename))
stop("Please specify a PED file name in the parameter object.")
pedDf <- .readPEDfile(param(x)$pedFilename)
.autosomalRecessiveHomozygousMask(vObj=allVariants(x, groupBy="nothing"), pedDf=pedDf,
bsgenome=param(x)$bsgenome, use="everything")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.