#'Create a mutationCalls objects from nucleotide base calls and
#' defines a exclusionlist (cohort)
#'
#'Identifies relevant mitochondrial somatic variants from raw counts
#' of nucleotide frequencies measured in single cells from several
#' individuals. Applies two sets of filters: In the first step,
#' filters on coverage to include potentially noisy variants; in the
#' second step, compares allele frequencies between patients to remove
#' variants that were observed in several individuals and that
#' therefore are unlikely to represent true somatic variants (e.g. RNA
#' editing events). The exclusionlist derived from the original Velten
#' et al. 2021 dataset is available internal and can be used on single
#' individuals using \code{\link{mutationCallsFromExclusionlist}}
#'@param BaseCounts A list of base call matrices (one matrix per cell)
#'as produced by \code{\link{baseCountsFromBamList}} or
#'\code{\link{bam2R_10x}}.
#'@param patient A character vector associating each cell / entry in
#'the \code{BaseCount} list with a patient
#'@param sites Vector specifying genomic regions, defaults to the
#'entire mitochondrial genome. Excepts a string but may be
#'included as a GRanges object.
#'@param MINREADS Minimum number of reads on a site in a single cell
#'to qualify the site as covered
#'@param MINCELL Minimum number of cells across the whole data set to
#'cover a site
#'@param MINFRAC Fraction of reads on the mutant allele to
#'provisionally classify a cell as mutant
#'@param MINCELLS.PATIENT Minimum number of mutant cells per patient
#'to classify the mutation as relevant in that patient, AND
#'@param MINFRAC.PATIENT Minimum fraction of mutant cells per patient
#'to classify the mutation as relevant in that patient
#'@param MINFRAC.OTHER Minimum fraction of mutant cells identified in
#'a second patient for the mutation to be excluded. Fraction
#'relative to the fraction of of cells from the patient where a
#'variant is enriched.
#'@param USE.REFERENCE Boolean. The variant calls will be of the
#'format REF>ALT where REF is decided based on the selected
#'\code{genome} annotation. If set to FALSE, the reference allele
#'will be the most abundant.
#'@param genome The mitochondrial genome of the sample being
#'investigated. Please note that this is the UCSC standard
#'chromosome sequence. Default: hg38.
#'@param customGenome A GRanges object containing a custom annotation. If
#'provided, this genome will be used instead of the predefined options
#'specified by the `genome` parameter. Default is NULL.
#'@return A list of \code{\link{mutationCalls}} objects (one for each
#'\code{patient}) and an entry named \code{exclusionlist}
#'containing a exclusionlist of sites with variants in several
#'individuals
#'@examples sites.gr <- GenomicRanges::GRanges("chrM:1-15000")
#'BaseCounts <- bam2R_10x(file = system.file("extdata",
#'"mm10_10x.bam", package="mitoClone2"), sites=sites.gr)
#'mutCalls <- mutationCallsFromCohort(BaseCounts,
#'patient=c('sample2','sample1','sample2','sample2','sample1','sample2'),
#'MINCELL=1, MINFRAC=0, MINCELLS.PATIENT=1, genome='mm10',
#'sites=sites.gr)
#'@export
mutationCallsFromCohort <- function(BaseCounts,
sites,
patient,
MINREADS = 5,
MINCELL = 20,
MINFRAC = 0.1,
MINCELLS.PATIENT = 10,
MINFRAC.PATIENT = 0.01,
MINFRAC.OTHER = 0.1,
USE.REFERENCE = TRUE,
genome = 'hg38',
customGenome = NULL) {
message("Making sure 'sites' parameter is set correctly...")
if (!length(sites) == 1) {
stop('Your sites parameter must be a character vector or GRanges object of length 1')
}
sites <- GenomicRanges::GRanges(sites)
## read in the
ntcountsArray <- simplify2array(BaseCounts)
ntcountsArray <- aperm(ntcountsArray, c(1, 3, 2))
ntcountsArray <- ntcountsArray[, , c('A', 'T', 'C', 'G', 'N')]
if (USE.REFERENCE) {
if (!is.null(customGenome)) {
# Use custom genome if provided
reference <- customGenome
} else {
reference <- switch(genome,
"hg38" = hg38.dna,
"hg19" = hg19.dna,
"mm10" = mm10.dna)
}
reference <-
reference[GenomicRanges::start(sites):GenomicRanges::end(sites)]
message(paste0(
'Looks good. Using the UCSC ',
genome,
' genome as a reference for variants.',
' Be wary, only mitochondrial annotations are available!'
))
} else{
message(paste0(
'Looks good. However, the mutation names are run specific and may include N.'
))
totalntCounts <- apply(ntcountsArray, c(1, 3), sum)
reference <-
colnames(totalntCounts)[apply(totalntCounts, 1, which.max)]
}
## get total reads per cell
total_cov_per_cell <- rowSums(colSums(ntcountsArray))
## make the mutation calls per var
variant_calls <- lapply(seq_along(reference), function(pos) {
## check which position have at least MINREADS total reads in each cell
support <- apply(ntcountsArray[pos, , ] >= MINREADS, 2, sum)
## check which position have at least MINREADS total reads in each cell
## remove reference calls and sites where unknown N calls dominate - perhapse modify later
support <-
support[!names(support) %in% c(reference[pos], "N", "INS", "DEL", "-")]
## only keep variants showing up in at least MINCELL
candidates <- names(support)[support >= MINCELL]
## return null if no variants
if (length(candidates) == 0) {
NULL
} else {
total_cov_pos <- rowSums(ntcountsArray[pos, , ])
out_matrix <- sapply(candidates, function(mutid) {
## mutations have at least MINREADS reads and exist as MINFRAC % of the total coverage at this nt position
is_mut <-
ntcountsArray[pos, , mutid] >= MINREADS &
ntcountsArray[pos, , mutid] >= (MINFRAC * total_cov_pos)
## reference have at least MINREADS reads and exist as MINFRAC % of the total coverage at this nt position
is_ref <-
ntcountsArray[pos, , reference[pos]] >= MINREADS &
ntcountsArray[pos, , reference[pos]] >= (MINFRAC * total_cov_pos)
## label cell depending on mutation and reference allele detection
ifelse(is_mut,
ifelse(is_ref, "BOTH", "MUT"),
ifelse(is_ref, "WT", "DROP"))
})
colnames(out_matrix) <-
paste0(pos, reference[pos], ">", candidates)
return(out_matrix)
}
})
## merge the variant call matricies
variant_calls <- do.call(cbind, variant_calls)
## count number of cell that are mutant per patient
varcount.bypatient <- sapply(unique(patient), function(pa) {
as.integer(colSums(variant_calls[patient == pa, ] == "BOTH" |
variant_calls[patient == pa, ] == "MUT"))
})
row.names(varcount.bypatient) <- colnames(variant_calls)
## total cells per patient
patient.totalcells <-
as.integer(table(patient)[colnames(varcount.bypatient)])
names(patient.totalcells) <- colnames(varcount.bypatient)
## fraction of all cells containg said variant per patient
varcount.bypatient.fraction <-
t(t(varcount.bypatient) / patient.totalcells)
## find variants with at least MINCELLS.PATIENT in any patient AND at least MINFRAC.PATIENT % of all reads at a given position for that patient
filter <- apply(varcount.bypatient, 1, function(ptid) {
max(ptid) >= MINCELLS.PATIENT &
max(ptid) / patient.totalcells[which.max(ptid)] >= MINFRAC.PATIENT
})
## identify enriched variants that appear in at least MINCELLS.PATIENT and at a fraction of over MINFRAC.OTHER % of the maximum observed frequency of this variant in only a single patient
## CONSIDER CHANGING OR ADDING A SPECIFIC CUTOFF FRAC filter
singlepatient <-
filter &
apply(varcount.bypatient.fraction, 1, function(x)
sum(x >= MINFRAC.OTHER * max(x))) == 1 &
rowSums(varcount.bypatient >= MINCELLS.PATIENT) == 1
## identify enriched variants that appear in at least MINCELLS.PATIENT and at a fraction of over MINFRAC.OTHER % of the maximum observed frequency of this variant in more than one patient
multipatient <-
filter &
apply(varcount.bypatient.fraction, 1, function(x)
sum(x >= MINFRAC.OTHER * max(x))) > 1 &
rowSums(varcount.bypatient >= MINCELLS.PATIENT) > 1
## extract mutations that are unique to individual patients
mutation.bypatient <-
colnames(varcount.bypatient)[apply(varcount.bypatient[singlepatient, ], 1, which.max)]
## pull the variant cells
variant_calls_selected <- variant_calls[, singlepatient]
## extract the baseCounts for variants of interest
out <- lapply(unique(patient), function(ptid) {
##retrieve matrices of allele counts for patient specific variants
if (sum(mutation.bypatient == ptid) == 0) {
return(NULL)
}
MN <-
pullcountsVars(BaseCounts[patient == ptid], colnames(variant_calls_selected)[mutation.bypatient == ptid])
##create mutationCalls object
o <- mutationCallsFromMatrix(t(MN$M), t(MN$N))
})
names(out) <- unique(patient)
## extract mutations that are shared in one or more patient for exclusionlist
out$exclusionlist <- rownames(varcount.bypatient[multipatient, ])
out$exclusionlist <-
gsub("(\\d+)(.+)", "\\1 \\2", out$exclusionlist)
return(out)
}
#'Create a mutationCalls object from nucleotide base calls using a
#' exclusionlist (single individual)
#'
#'Identifies relevant mitochondrial somatic variants from raw counts
#' of nucleotide frequencies. Applies two sets of filters: In the
#' first step, filters on coverage and minimum allele frequency to
#' exclude potentially noisy variants; in the second step, filters
#' against a exclusionlist of variants that were observed in several
#' individuals and that therefore are unlikely to represent true
#' somatic variants (e.g. RNA editing events). These exclusionlists
#' are created using \code{\link{mutationCallsFromCohort}}
#'@param BaseCounts A list of base call matrices (one matrix per cell)
#'as produced by \code{\link{baseCountsFromBamList}}
#'@param lim.cov Minimal coverage required per cell for a cell to be
#'classified as covered
#'@param min.af Minimal allele frequency for a cell to be classified
#'as mutant
#'@param min.num.samples Minimal number of cells required to be
#'classified as covered and mutant according to the thresholds
#'set in \code{lim.cov} and \code{min.af}. Usually specified as a
#'fraction of the total number of cells.
#'@param universal.var.cells Maximum number of cells required to be
#'classified as mutant according to the threshold set in
#'\code{min.af.universal}. Usually specified as a fraction of
#'the total number of cells; serves to avoid e.g. germline
#'variants.
#'@param min.af.universal Minimal allele frequency for a cell to be
#'classified as mutant, in the context of removing universal
#'variants. Defaults to \code{min.af}, but can be set to lower
#'values.
#'@param exclusionlists.use List of sites to exclude for variants
#'calling. The default exclusionlists object included with this
#'package contains exclude or hardmask in GRanges format. The
#'four exclusionlists included in this case are: "three" (hg38
#'sites that are part of homopolymer(e.g. AAA) of at least 3 bp
#'in length), "mutaseq" (sites discovered to be overrepresented
#'in AML SmartSeq2 data analysis from Velten et al 2021),
#'"masked" (sites that are softmasked in either the UCSC or
#'Refseq genome annotations), and "rnaEDIT" which are sites that
#'are subjected to RNA-editing according to the REDIportal. These
#'lists can also be input manually by a researcher and provided
#'as either coordinates (as a string) or as a GRanges objects.
#'@param max.var.na Final filtering step: Remove all mutations with no
#'coverage in more than this fraction of cells
#'@param max.cell.na Final filtering step: Remove all cells with no
#'coverage in more than this fraction of mutations
#'@param genome The mitochondrial genome of the sample being
#'investigated. Please note that this is the UCSC standard
#'chromosome sequence. Default: hg38.
#'@param customDNA A character vector containing a custom DNA sequence. If
#'provided, this sequence will be used instead of the predefined options
#'specified by the `genome` parameter. Default is NULL.
#'@param ncores number of cores to use for tabulating potential
#'variants (defaults to 2)
#'@param ... Parameters passed to
#'\code{\link{mutationCallsFromMatrix}}
#'@return An object of class \code{\link{mutationCalls}}
#'@examples load(system.file("extdata/example_counts.Rda",package = "mitoClone2"))
#'Example <- mutationCallsFromExclusionlist(example.counts,
#' min.af=0.05, min.num.samples=5,
#' universal.var.cells = 0.5 * length(example.counts),
#' binarize = 0.1)
#'@export
mutationCallsFromExclusionlist <- function(BaseCounts,
lim.cov = 20,
min.af = 0.2,
min.num.samples = 0.01 * length(BaseCounts),
min.af.universal = min.af,
universal.var.cells = 0.95 * length(BaseCounts),
exclusionlists.use = exclusionlists,
max.var.na = 0.5,
max.cell.na = 0.95,
genome = 'hg38',
customDNA = NULL,
ncores = 1,
...) {
if (!is.null(customDNA)) {
# Use custom genome sequence if provided
mito.dna <- customDNA
} else {
mito.dna <- switch(genome,
"hg38" = hg38.dna,
"hg19" = hg19.dna,
"mm10" = mm10.dna)
}
if (!is.null(customDNA) | genome != 'hg38'){
print('Be aware! The provided exclusionlists are defined based on HUMAN GENOME hg38. Provide custom lists if this is not your genome.')
}
varaf <- parallel::mclapply(BaseCounts, function(x) {
## focus on A,G,C,T
x <- x[, c('A', 'T', 'C', 'G')]
## find cell that have less than 100 cov over agct at a given pos
zeroes <- rowSums(x) < lim.cov
## af calc
x.af <- x / rowSums(x)
## alleleRatio
##x.af <- x / (x+apply(x,1,max))
x.af <- reshape2::melt(x.af)
colnames(x.af) <- c('pos', 'nt', 'af')
## remove reference af's
x.af <- x.af[!(mito.dna[x.af$pos] == x.af$nt), ]
## remove N site
x.af <- x.af[!(mito.dna[x.af$pos] == 'N'), ]
x.af$name <- paste0(x.af$pos, ' ', mito.dna[x.af$pos], '>', x.af$nt)
## find dominant NT
x.af$af[x.af$pos %in% which(zeroes)] <- NA
x <- x.af$af
names(x) <- x.af$name
return(x)
}, mc.cores = ncores) #remove parallelism here
varaf <- do.call(cbind, varaf)
## you could allow for only sites with coverage!
## currently you filter at a rate of 10% cells dropping out max
##varaf <- varaf[rowSums(is.na(varaf))/length(BaseCounts) < max.fraction.na,]
varaf <-
varaf[rowSums(varaf > min.af, na.rm = TRUE) >= min.num.samples, ]
is.names <-
sapply(exclusionlists.use, function(x)
typeof(x) == "character")
#part 2 - filter based on the exclusionlist
if (sum(is.names) > 0) {
removal.names.list <- unique(unlist(exclusionlists.use[is.names]))
varaf <- varaf[!row.names(varaf) %in% removal.names.list, ]
}
if (sum(!is.names) > 0) {
removal.ranges.list <-
unique(unlist(GenomicRanges::GRangesList(exclusionlists.use[!is.names])))
varaf <-
varaf[-c(S4Vectors::queryHits(
GenomicRanges::findOverlaps(mut2GR(row.names(varaf)), removal.ranges.list)
)), ]
}
##if(drop.empty){
##colSums(varaf,na.rm=TRUE) > 0
##}
## remove empty rows
varaf <- varaf[rowSums(varaf, na.rm = TRUE) > 0, , drop = FALSE]
## remove variants that are at a certain universal af in a certain number of cells
varaf <-
varaf[!rowSums(varaf >= min.af.universal, na.rm = TRUE) >= universal.var.cells, ]
## vars must have less than X % NA's
varaf <- varaf[rowSums(is.na(varaf)) < max.var.na * NCOL(varaf), ]
## cells must have less than X % NA's
varaf <-
varaf[, colSums(is.na(varaf)) < max.cell.na * NROW(varaf)]
MN <- pullcountsVars(BaseCounts, rownames(varaf), colnames(varaf))
mutationCallsFromMatrix(t(MN$M), t(MN$N), ...)
}
#'Convert mutation string to GRanges
#'
#'@param mut The mutation to convert to a GRanges in the format of
#'"position reference>alternate".
#'@return Returns a GRanges object containg the site of the variant
#'along with reference/alternate allele data in the metacolumns
#'@examples mutation.as.granges <- mut2GR('1434 G>A')
#'@examples mutation.as.granges.no.space <- mut2GR('1434G>A')
#'@export
mut2GR <- function(mut) {
gr <-
GenomicRanges::GRanges(paste0('chrM:', as.numeric(gsub(
" *[A-Z].*", "", mut
))))
gr$ref <-
sapply(mut, function(x) {
unlist(strsplit(gsub("\\d+ *", "", x), ">"))[[1]]
})
gr$alt <-
sapply(mut, function(x) {
unlist(strsplit(gsub("\\d+ *", "", x), ">"))[[2]]
})
return(gr)
}
#'Pull variant counts
#'
#'@param BaseCounts A list of base call matrices (one matrix per cell)
#'as produced by \code{\link{baseCountsFromBamList}}
#'@param vars Character vector of variants to pull, in format 5643G>T
#'@param cells Character vector for cells to select, or NULL if all
#'cells from the input are to be used
#'@return A list with two entries, M (count table on the variant
#'allele) and N (count table on the reference allele)
#'@examples load(system.file("extdata/example_counts.Rda",package = "mitoClone2"))
#'known.variants <- c("9 T>C","12 G>A","13 G>A")
#'counts.known.vars <- pullcountsVars(example.counts, vars=known.variants)
#'@export
pullcountsVars <- function(BaseCounts, vars, cells = NULL) {
var.gr <- mut2GR(vars)
## subset cells of interest if appropriate
if (!is.null(cells)) {
message('Subsetting for specific cells...')
if (!all(cells %in% names(BaseCounts))) {
stop(
paste0(
'You are attempting to subset for ',
sum(!(cells %in% names(BaseCounts))),
'cell(s) that do not appear in your input BaseCounts!'
)
)
}
BaseCounts <- BaseCounts[names(BaseCounts) %in% cells]
}
## pull counts alt
M <- sapply(BaseCounts, function(cell) {
mapply(function(p, x)
cell[p, x],
GenomicRanges::start(var.gr),
S4Vectors::mcols(var.gr)$alt)
})
## pull total counts per position
N <- sapply(BaseCounts, function(cell) {
rowSums(cell[GenomicRanges::start(var.gr), c('A', 'G', 'C', 'T'), drop =
FALSE])
})
if (!is.matrix(M) | !is.matrix(N)) {
M <- matrix(M, ncol = length(M), dimnames = list(vars, names(M)))
N <- matrix(N, ncol = length(N), dimnames = list(vars, names(N)))
}
## maintain consistency with previous code for N by excluding the mutant allele calls
N <- N - M
rownames(M) <- row.names(N) <- vars
return(list(M = M, N = N))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.