Nothing
################################################################################
##################### Public/Private Class Definitions #########################
#!!!!!!!!!!!!!!!!!!!!!!!!!!!!! Public Class !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!#
#' Class VEP
#'
#' An S4 class for Variant Effect Predictor input, under development!!!
#' @name VEP-class
#' @rdname VEP-class
#' @slot path Character string specifying the paths of the VEP files read in.
#' @slot version Numeric value specifying the version of VEP used.
#' @slot vepObject vep object which inherits from VEP_Virtual class.
#' @exportClass VEP
#' @include VEP_Virtual-class.R
#' @import methods
setClass("VEP",
representation=representation(path="character",
version="numeric",
vepObject="VEP_Virtual"))
#' Constructor for the VEP container class.
#'
#' @name VEP
#' @rdname VEP-class
#' @param path String specifying the path to a VEP annotation file. Can accept
#' wildcards if multiple VEP annotation files exist (see details).
#' @param data data.table object storing a GMS annotation file. Overrides "path"
#' if specified.
#' @param version String specifying the version of the VEP files, Defaults to
#' auto which will look for the version in the header.
#' @param verbose Boolean specifying if progress should be reported while
#' reading in the VEP files.
#' @details When specifying a path to a VEP annotation file the option exist to
#' either specify the full path to an annotation file or to use wildcards to
#' specify multiple files. When specifying a full path the initalizer will check
#' if a column named "sample" containg the relevant sample for each row exists.
#' If such a column is not found the initalizer will assume this file
#' corresponds to only one sample and populate a sample column accordingly.
#' Alternatively if multiple files are specified at once using a wildcard, the
#' initalizer will aggregate all the files and use the file names minus any
#' extension to populate sample names.
#' @seealso \code{\link{Waterfall}}
#' @seealso \code{\link{MutSpectra}}
#' @importFrom data.table fread
#' @importFrom data.table rbindlist
#' @importFrom data.table data.table
#' @export
VEP <- function(path, data=NULL, version="auto", verbose=FALSE){
message("This function is part of the new S4 feature and is under active development")
if(is.null(data)){
# Grab all files and assign to slot
vepFiles <- Sys.glob(path)
path <- vepFiles
# anonymous function to read in files
a1 <- function(x, verbose){
# detect OS and remove slashes and extension
if(.Platform$OS.type == "windows"){
sampleName <- gsub("(.*/)||(.*\\\\)", "", x)
sampleName <- gsub("\\.[^.]+$", "", sampleName)
} else {
sampleName <- gsub("(.*/)", "", x)
sampleName <- gsub("\\.[^.]+$", "", sampleName)
}
# read the header
header <- readLines(con=x, n=400)
header <- header[grepl("^##", header)]
# find where headers stop and read the data
skip <- length(header)
vepData <- suppressWarnings(data.table::fread(input=x,
stringsAsFactors=TRUE,
verbose=verbose,
skip=skip))
# set sample if it's not already in the data table
if(any(colnames(vepData) %in% "sample")){
return(vepData)
} else {
vepData$sample <- sampleName
return(list("data"=vepData, "header"=header))
}
}
# aggregate data into a single data table if necessary
if(length(vepFiles) == 0){
memo <- paste("No files found using:", path)
stop(memo)
} else {
# Read in the information
vepInfo <- lapply(vepFiles, a1, verbose)
# extract header and data information
vepHeader <- lapply(vepInfo, function(x) x[["header"]])
vepData <- lapply(vepInfo, function(x) x[["data"]])
# aggregate the data
vepData <- data.table::rbindlist(vepData, fill=TRUE)
}
} else if(is.data.table(data)){
path <- "none"
vepHeader <- data.table::data.table()
vepData <- data
} else {
memo <- paste("data is not of class data.table,",
"attempting to coerce")
warning(memo)
path <- "none"
vepHeader <- data.table::data.table()
vepData <- data.table::as.data.table(data)
}
# grab the version and assign it
if(length(version) > 1){
memo <- paste("user supplied version is not of length 1, using first element")
warning(memo)
version <- version[1]
}
a2 <- function(x){
# find the element which defines the VEP version
x <- x[grepl("VARIANT EFFECT PREDICTOR", x)]
# extract the version
x <- regmatches(x,regexpr("[0-9]+\\.*[0-9]*",x))
if(length(x) != 1){
memo <- paste("Expected 1 entry for VEP version, found:",
length(x), "using", as.numeric(x[1]))
warning(memo)
}
return(as.numeric(x[1]))
}
if(version == "auto"){
version <- lapply(vepHeader, a2)
version <- unique(unlist(version))
if(length(version) > 1){
version <- version[1]
memo <- paste("Expect 1 version, the following versions were",
"found:", toString(version), "Using version",
version, "for parsing!")
warning(memo)
} else if(length(version) == 0){
memo <- paste("Cannot infer version from vep headers",
"no versions found!")
stop(memo)
}
}
# assign the vepData to it's slot
if(version >= 88 & version <= 99.0){
vepObject <- VEP_v88(vepData=vepData, vepHeader=vepHeader)
} else {
memo <- paste("Currently only VEP version 88-99.0 are supported, make a",
"feature request on",
"https://github.com/griffithlab/GenVisR!")
stop(memo)
}
new("VEP", path=path, vepObject=vepObject, version=version)
}
################################################################################
###################### Accessor function definitions ###########################
#' @rdname writeData-methods
#' @aliases writeData
setMethod(f="writeData",
signature="VEP",
definition=function(object, file, ...){
writeData(object@vepObject, file, sep="\t")
})
#' @rdname getVersion-methods
#' @aliases getVersion
setMethod(f="getVersion",
signature="VEP",
definition=function(object, ...){
version <- object@version
return(version)
})
#' @rdname getPath-methods
#' @aliases getPath
setMethod(f="getPath",
signature="VEP",
definition=function(object, ...){
path <- object@path
return(path)
})
#' @rdname getHeader-methods
#' @aliases getHeader
setMethod(f="getHeader",
signature="VEP",
definition=function(object, ...){
header <- getHeader(object@vepObject)
return(header)
})
#' @rdname getDescription-methods
#' @aliases getDescription
setMethod(f="getDescription",
signature="VEP",
definition=function(object, ...){
description <- getDescription(object@vepObject)
return(description)
})
#' @rdname getPosition-methods
#' @aliases getPosition
setMethod(f="getPosition",
signature="VEP",
definition=function(object, ...){
positions <- getPosition(object@vepObject)
return(positions)
})
#' @rdname getMutation-methods
#' @aliases getMutation
setMethod(f="getMutation",
signature="VEP",
definition=function(object, ...){
mutations <- getMutation(object@vepObject)
return(mutations)
})
#' @rdname getSample-methods
#' @aliases getSample
setMethod(f="getSample",
signature="VEP",
definition=function(object, ...){
sample <- getSample(object@vepObject)
return(sample)
})
#' @rdname getMeta-methods
#' @aliases getMeta
setMethod(f="getMeta",
signature="VEP",
definition=function(object, ...){
meta <- getMeta(object@vepObject)
return(meta)
})
################################################################################
####################### Method function definitions ############################
#' @rdname setMutationHierarchy-methods
#' @aliases setMutationHierarchy
#' @noRd
#' @importFrom data.table data.table
#' @importFrom data.table setDT
#' @importFrom data.table rbindlist
#' @importFrom grDevices colors
setMethod(f="setMutationHierarchy",
signature="VEP",
definition=function(object, mutationHierarchy, verbose, ...){
# set the mutation hierarchy if a custom hierarchy is unspecified
if(is.null(mutationHierarchy)){
mutationHierarchy$mutation <- c("transcript_ablation", "splice_acceptor_variant",
"splice_donor_variant", "stop_gained",
"frameshift_variant", "stop_lost", "start_lost",
"transcript_amplification", "inframe_insertion",
"inframe_deletion", "missense_variant",
"protein_altering_variant", "splice_region_variant",
"incomplete_terminal_codon_variant", "stop_retained_variant",
"synonymous_variant", "coding_sequence_variant",
"mature_miRNA_variant", "5_prime_UTR_variant",
"3_prime_UTR_variant", "non_coding_transcript_exon_variant",
"intron_variant", "NMD_transcript_variant",
"non_coding_transcript_variant", "upstream_gene_variant",
"downstream_gene_variant", "TFBS_ablation",
"TFBS_amplification", "TF_binding_site_variant",
"regulatory_region_ablation", "regulatory_region_amplification",
"feature_elongation", "regulatory_region_variant",
"feature_truncation", "intergenic_variant")
mutationHierarchy$color <- c("#bd3d3d", "#d45a73", "#cd3300",
"#d24719", "#895878", "#e18466",
"#bd92cc", "#9e55a7", "#7d4281",
"#a6cea9", "#78b3d2", "#00a0b0",
"#4f372d", "#c6c386", "#739475",
"#ffdb86", "#ddb554", "#d69f35",
"#f98107", "#383838", "#5acb7f",
"#1e81b5", "#bcd9e8", "#ab763d",
"#a72ca2", "#f4a460", "#0000ff",
"#00510a", "#2a7694", "#00c9bb",
"#8c860e", "#687233", "#6f3333",
"#626161", "#db9294")
mutationHierarchy <- data.table::data.table("mutation"=mutationHierarchy$mutation,
"color"=mutationHierarchy$color)
}
# check that mutationHiearchy is a data table
if(!is(mutationHierarchy, "data.table")){
memo <- paste("mutationHiearchy is not an object of class",
"data.table, attempting to coerce.")
warning(memo)
mutationHierarchy <- data.table::setDT(mutationHierarchy)
}
# check for the correct columns
if(!all(colnames(mutationHierarchy) %in% c("mutation", "color"))){
missingCol <- colnames(mutationHierarchy)[!c("mutation", "color") %in% colnames(mutationHierarchy)]
memo <- paste("The correct columns were not found in",
"mutationHierarchy, missing", toString(missingCol))
stop(memo)
}
# don't want to add mutations from vep which are valid but are comma seperated on one line
# these are split up and taken care of in toWaterfall(), see below
consequences <- as.character(unique(object@vepObject@mutation$Consequence))
consequences <- consequences[grepl(",", consequences, fixed=TRUE)]
if(length(consequences) != 0){
consequencesSplit <- strsplit(consequences, ",", fixed=TRUE)
consequences <- consequences[sapply(consequencesSplit, function(x) all(x %in% mutationHierarchy$mutation))]
}
# check that all mutations are specified, if not add entries for them
if(!all(object@vepObject@mutation$Consequence %in% c(mutationHierarchy$mutation, consequences))){
missingMutations <- unique(object@vepObject@mutation$Consequence[!object@vepObject@mutation$Consequence %in% c(mutationHierarchy$mutation, consequences)])
memo <- paste("The following mutations were found in the",
"input however were not specified in the",
"mutationHierarchy!", toString(missingMutations),
"adding these in as least important and",
"assigning random colors!")
warning(memo)
newCol <- grDevices::colors(distinct=TRUE)[!grepl("^gray", grDevices::colors(distinct=TRUE))]
missingMutations <- data.table::data.table("mutation"=missingMutations,
"color"=sample(newCol, length(missingMutations)))
mutationHierarchy <- data.table::rbindlist(list(mutationHierarchy, missingMutations), use.names=TRUE, fill=TRUE)
}
# add in a pretty print mutation labels
mutationHierarchy$label <- gsub("_", " ", mutationHierarchy$mutation)
mutationHierarchy$label <- gsub("'", "' ", mutationHierarchy$label)
# check for duplicate mutations
if(any(duplicated(mutationHierarchy$mutation))){
duplicateMut <- mutationHierarchy[duplicated(mutationHierarchy$mutation),"mutation"]
memo <- paste("The mutation type",toString(duplicateMut),
"was duplicated in the supplied mutationHierarchy!")
mutationHierarchy <- mutationHierarchy[!duplicated(mutationHierarchy$mutation),]
}
# print status message
if(verbose){
memo <- paste("Setting the hierarchy of mutations from most",
"to least deleterious and mapping to colors:",
toString(mutationHierarchy$mutation))
message(memo)
}
return(mutationHierarchy)
})
#' @rdname toWaterfall-methods
#' @aliases toWaterfall
#' @importFrom data.table rbindlist
#' @noRd
setMethod(f="toWaterfall",
signature="VEP",
definition=function(object, hierarchy, labelColumn, verbose, ...){
# print status message
if(verbose){
memo <- paste("Converting", class(object),
"to expected waterfall format")
message(memo)
}
# grab the sample, mutation, gene columns and set a label and a flag
# to set the label
sample <- object@vepObject@sample
mutation <- object@vepObject@mutation[,"Consequence"]
gene <- object@vepObject@meta[,"SYMBOL"]
altGene <- object@vepObject@meta[,"Gene"]
label <- NA
labelFlag <- TRUE
# fill in gene values from the two gene colums we have
if(any(gene$SYMBOL %in% c("-"))){
index <- which(gene$SYMBOL %in% c("-"))
gene[index,] <- altGene[index,]
}
# if a label column exists and is proper overwrite the label variable
# if not change the flag
if(!is.null(labelColumn)){
if(length(labelColumn) != 1) {
memo <- paste("Parameter \"labelColumn\" must be of length 1!",
"Found length to be", length(labelColumn))
warning(memo)
labelFlag <- FALSE
}
if(any(!labelColumn %in% colnames(getMeta(object)))){
memo <- paste("Did not find column:", labelColumn,
"in the meta slot of the vepObject! Valid",
"names are:", toString(colnames(getMeta(object))))
warning(memo)
labelFlag <- FALSE
}
if(labelFlag){
label <- getMeta(object)[,labelColumn, with=FALSE]
}
}
# combine all columns into a consistent format
waterfallFormat <- cbind(sample, gene, mutation, label)
colnames(waterfallFormat) <- c("sample", "gene", "mutation", "label")
# make a temporary ID column for genomic features to collapse on
# this will ensure the mutation burden/frequency plot will be accurate
waterfallFormat$key <- paste0(object@vepObject@position$Location, ":",
object@vepObject@mutation$Allele, ":",
object@vepObject@sample$sample)
rowCountOrig <- nrow(waterfallFormat)
# seperate out any comma delimited mutation types not specified in the hierarchy
# if it's in the hierarchy we don't want to split it out later
index_multiConsequence <- (grepl(',', waterfallFormat$mutation, fixed=F) & !waterfallFormat$mutation %in% hierarchy$mutation)
waterfallFormat_multiConsequence <- waterfallFormat[index_multiConsequence,]
waterfallFormat_singleConsequence <- waterfallFormat[!index_multiConsequence,]
# status message
if(length(index_multiConsequence) > 0 & verbose){
memo <- paste("Found", length(index_multiConsequence), "rows with comma delimited mutations",
"which are valid. Splitting these into multiple entries and collapsing via the",
"specified hierarchy")
message(memo)
}
# cast data into form where mutation column is not comma delimited
# the hierarchy will sort out any duplicates
if(length(index_multiConsequence) > 0){
V1 <- . <- NULL # appease R CMD CHECK
waterfallFormat_multiConsequence <- waterfallFormat_multiConsequence[, strsplit(as.character(mutation), ",", fixed=TRUE),
by = .(sample, gene, label, mutation, key)][,.(mutation = V1, sample, gene, label, key)]
# combine everything back together
waterfallFormat <- data.table::rbindlist(list(waterfallFormat_multiConsequence, waterfallFormat_singleConsequence), use.names=TRUE, fill=TRUE)
} else {
waterfallFormat <- waterfallFormat_singleConsequence
}
# order the data based on the mutation hierarchy,
# remove all duplicates based on key, and remove the key column
waterfallFormat$mutation <- factor(waterfallFormat$mutation, levels=hierarchy$mutation)
waterfallFormat <- waterfallFormat[order(waterfallFormat$mutation),]
waterfallFormat <- waterfallFormat[!duplicated(waterfallFormat$key),]
waterfallFormat[,key:=NULL]
# print status message
if((rowCountOrig - nrow(waterfallFormat)) != 0){
memo <- paste("Removed", rowCountOrig - nrow(waterfallFormat),
"rows from the data which harbored duplicate",
"genomic locations")
warning(memo)
}
# if there are any genes which are NA, remove them
rowCountOrig <- nrow(waterfallFormat)
waterfallFormat <- waterfallFormat[!is.na(waterfallFormat$gene),]
if(rowCountOrig != nrow(waterfallFormat) & verbose){
memo <- paste("Removed", rowCountOrig - nrow(waterfallFormat),
"which had NA for gene the gene label")
message(memo)
}
# convert appropriate columns to factor
waterfallFormat$sample <- factor(waterfallFormat$sample)
return(waterfallFormat)
})
#' @rdname toMutSpectra-methods
#' @aliases toMutSpectra
#' @param object Object of class VEP
#' @param BSgenome Object of class BSgenome, used to extract reference bases if
#' not supplied by the file format.
#' @param verbose Boolean specifying if status messages should be reported
#' @importFrom Rsamtools getSeq
#' @importFrom IRanges IRanges
#' @importFrom GenomicRanges GRanges
#' @importFrom BSgenome available.genomes
#' @importFrom BSgenome installed.genomes
#' @importFrom data.table as.data.table
#' @importFrom GenomeInfoDb seqlevels
#' @importFrom GenomeInfoDb seqnames
#' @noRd
setMethod(f="toMutSpectra",
signature="VEP",
definition=function(object, BSgenome, verbose, ...){
# print status message
if(verbose){
memo <- paste("Converting", class(object),
"to expected MutSpectra format")
message(memo)
}
# grab the BSgenome
if(is.null(BSgenome)){
BSgenome <- retrieve_BSgenome(object, verbose, ...)
}
# get an index of only the snvs
snvIndex <- which(object@vepObject@meta$VARIANT_CLASS == "SNV")
# status message
if(verbose){
memo <- paste("Removing", nrow(object@vepObject@sample)-length(snvIndex),
"entries which are not of class SNV!")
message(memo)
}
# grab the sample, mutation, position columns
sample <- object@vepObject@sample[snvIndex]
variantAllele <- object@vepObject@mutation[snvIndex,"Allele"]
position <- object@vepObject@position[snvIndex,"Location"]
# split the position into chr, start , stop
positionSplit <- lapply(as.character(position$Location), strsplit, ":", fixed=TRUE)
chr <- unlist(lapply(positionSplit, function(x) x[[1]][1]))
coord <- unlist(lapply(positionSplit, function(x) x[[1]][2]))
coord <- lapply(coord, strsplit, "-", fixed=TRUE)
start <- as.numeric(unlist(lapply(coord, function(x) x[[1]][1])))
stop <- as.numeric(unlist(lapply(coord, function(x) x[[1]][2])))
stop[is.na(stop)] <- start[is.na(stop)]
# combine everything into one GRanges object
variantGR <- GenomicRanges::GRanges(seqnames=chr, IRanges::IRanges(start=start, end=stop))
variantGR$sample <- sample$sample
variantGR$variantAllele <- toupper(variantAllele$Allele)
# check that the reference chromosomes match the input and BSgenome
seqMismatch <- unique(chr[!chr %in% seqnames(BSgenome)])
if(length(seqMismatch >= 1)){
memo <- paste("The following chromosomes do not match the BSgenome specified:", toString(seqMismatch))
warning(memo)
if(length(unique(chr[!paste0("chr", chr) %in% seqnames(BSgenome)])) == 0){
memo <- paste("appending \"chr\" to chromosomes to fix mismatch with the BSgenome")
warning(memo)
chr <- paste0("chr", chr)
GenomeInfoDb::seqlevels(variantGR) <- unique(chr)
GenomeInfoDb::seqnames(variantGR)[seq_along(variantGR)] <- chr
} else {
memo <- paste("removing entries with chromosomes not matching the BSgenome")
warning(memo)
variantGR_origSize <- length(variantGR)
variantGR <- variantGR[as.character(seqnames(variantGR)) %in% as.character(seqnames(BSgenome)),]
if(verbose){
memo <- paste("removed", variantGR_origSize - length(variantGR), "entries where chromosomes did",
"not match the BSgenome")
}
}
}
if(length(variantGR) == 0){
memo <- paste("There are no variants left after subsets.")
stop(memo)
}
# get the reference sequences
if(verbose){
memo <- paste("Annotating reference bases")
message(memo)
}
refAllele <- toupper(Rsamtools::getSeq(BSgenome, variantGR, as.character=TRUE))
# combine all columns into a consistent format and remove duplicate variants
variantGR$refAllele <- refAllele
mutSpectraFormat <- data.table::as.data.table(variantGR)
keep <- c("sample", "seqnames", "start", "end", "refAllele", "variantAllele")
mutSpectraFormat <- mutSpectraFormat[,keep, with=FALSE]
colnames(mutSpectraFormat) <- c("sample", "chromosome", "start", "stop", "refAllele", "variantAllele")
# unique, to make sure no duplicate variants exist to throw off the counts
rowCountOrig <- nrow(mutSpectraFormat)
mutSpectraFormat <- unique(mutSpectraFormat)
# print status message
if(verbose){
memo <- paste("Removed", rowCountOrig - nrow(mutSpectraFormat),
"rows from the data which harbored duplicate",
"genomic locations")
message(memo)
}
# convert appropriate columns to factor
mutSpectraFormat$sample <- factor(mutSpectraFormat$sample)
return(mutSpectraFormat)
})
#' @rdname toRainfall-methods
#' @aliases toRainfall
#' @param object Object of class VEP
#' @param BSgenome Object of class BSgenome, used to extract reference bases if
#' not supplied by the file format.
#' @param verbose Boolean specifying if status messages should be reported
#' @importFrom Rsamtools getSeq
#' @importFrom IRanges IRanges
#' @importFrom GenomicRanges GRanges
#' @importFrom BSgenome available.genomes
#' @importFrom BSgenome installed.genomes
#' @importFrom data.table as.data.table
#' @importFrom GenomeInfoDb seqlevels
#' @importFrom GenomeInfoDb seqnames
#' @noRd
setMethod(f="toRainfall",
signature="VEP",
definition=function(object, BSgenome, verbose, ...){
# print status message
if(verbose){
memo <- paste("Converting", class(object),
"to expected Rainfall format")
message(memo)
}
# grab the BSgenome
if(is.null(BSgenome)){
BSgenome <- retrieve_BSgenome(object, verbose, ...)
}
# grab the sample, mutation, position columns
sample <- object@vepObject@sample
variantAllele <- object@vepObject@mutation[,"Allele"]
position <- object@vepObject@position[,"Location"]
# split the position into chr, start , stop
positionSplit <- lapply(as.character(position$Location), strsplit, ":", fixed=TRUE)
chr <- unlist(lapply(positionSplit, function(x) x[[1]][1]))
coord <- unlist(lapply(positionSplit, function(x) x[[1]][2]))
coord <- lapply(coord, strsplit, "-", fixed=TRUE)
start <- as.numeric(unlist(lapply(coord, function(x) x[[1]][1])))
stop <- as.numeric(unlist(lapply(coord, function(x) x[[1]][2])))
stop[is.na(stop)] <- start[is.na(stop)]
# combine everything into one GRanges object
variantGR <- GenomicRanges::GRanges(seqnames=chr, IRanges::IRanges(start=start, end=stop))
variantGR$sample <- sample$sample
variantGR$variantAllele <- toupper(variantAllele$Allele)
# check that the reference chromosomes match the input and BSgenome
seqMismatch <- unique(chr[!chr %in% seqnames(BSgenome)])
if(length(seqMismatch >= 1)){
memo <- paste("The following chromosomes do not match the BSgenome specified:", toString(seqMismatch))
warning(memo)
if(length(unique(chr[!paste0("chr", chr) %in% seqnames(BSgenome)])) == 0){
memo <- paste("appending \"chr\" to chromosomes to fix mismatch with the BSgenome")
warning(memo)
chr <- paste0("chr", chr)
GenomeInfoDb::seqlevels(variantGR) <- unique(chr)
GenomeInfoDb::seqnames(variantGR)[seq_along(variantGR)] <- chr
} else {
memo <- paste("removing entries with chromosomes not matching the BSgenome")
warning(memo)
variantGR_origSize <- length(variantGR)
variantGR <- variantGR[as.character(seqnames(variantGR)) %in% as.character(seqnames(BSgenome)),]
if(verbose){
memo <- paste("removed", variantGR_origSize - length(variantGR), "entries where chromosomes did",
"not match the BSgenome")
}
}
}
if(length(variantGR) == 0){
memo <- paste("There are no variants left after subsets.")
stop(memo)
}
# get the reference sequences
if(verbose){
memo <- paste("Annotating reference bases")
message(memo)
}
refAllele <- toupper(Rsamtools::getSeq(BSgenome, variantGR, as.character=TRUE))
# combine all columns into a consistent format and remove duplicate variants
variantGR$refAllele <- refAllele
rainfallFormat <- data.table::as.data.table(variantGR)
keep <- c("sample", "seqnames", "start", "end", "refAllele", "variantAllele")
rainfallFormat <- rainfallFormat[,keep, with=FALSE]
colnames(rainfallFormat) <- c("sample", "chromosome", "start", "stop", "refAllele", "variantAllele")
# remove cases where a mutation does not exist
rowCountOrig <- nrow(rainfallFormat)
rainfallFormat <- rainfallFormat[rainfallFormat$refAllele != rainfallFormat$variantAllele,]
if(rowCountOrig != nrow(rainfallFormat)){
memo <- paste("Removed", rowCountOrig - nrow(rainfallFormat),
"entries where the reference matches the variant")
warning(memo)
}
# unique, to make sure no duplicate variants exist to throw off the counts
rowCountOrig <- nrow(rainfallFormat)
rainfallFormat <- unique(rainfallFormat)
# print status message
if(verbose){
memo <- paste("Removed", rowCountOrig - nrow(rainfallFormat),
"rows from the data which harbored duplicate",
"genomic variants")
message(memo)
}
# make sure no duplicate genomic locations exist to throw off the mutation distance calculation
dupCoordIndex <- duplicated(rainfallFormat[,c("sample", "chromosome", "start")])
if(sum(dupCoordIndex) > 0){
rowCountOrig <- nrow(rainfallFormat)
rainfallFormat <- rainfallFormat[!dupCoordIndex,]
memo <- paste("Removed", rowCountOrig-nrow(rainfallFormat), "entries with identical",
"start coordinates")
warning(memo)
}
# create a flag column for where these entries came from
rainfallFormat$origin <- 'mutation'
# make sure everything is of the proper type
rainfallFormat$sample <- factor(rainfallFormat$sample, levels=unique(rainfallFormat$sample))
rainfallFormat$chromosome <- factor(rainfallFormat$chromosome, levels=unique(rainfallFormat$chromosome))
rainfallFormat$start <- as.integer(rainfallFormat$start)
rainfallFormat$stop <- as.integer(rainfallFormat$stop)
rainfallFormat$refAllele <- factor(rainfallFormat$refAllele, levels=unique(rainfallFormat$refAllele))
rainfallFormat$variantAllele <- factor(rainfallFormat$variantAllele, levels=unique(rainfallFormat$variantAllele))
return(rainfallFormat)
})
#' @rdname toLolliplot-methods
#' @aliases toLolliplot
#' @param object Object of class data.table
#' @param verbose Boolean specifying if status messages should be reported
#' @noRd
setMethod(f="toLolliplot",
signature="VEP",
definition=function(object, BSgenome, verbose, ...){
# print status message
if(verbose){
memo <- paste("converting", class(object), "to expected Lolliplot format")
message(memo)
}
# grab the BSgenome
if(is.null(BSgenome)){
BSgenome <- retrieve_BSgenome(object, verbose, ...)
}
# grab the sample, mutation, position columns
sample <- object@vepObject@sample
variantAllele <- object@vepObject@mutation[,"Allele"]
position <- object@vepObject@position[,"Location"]
# split the position into chr, start , stop
positionSplit <- lapply(as.character(position$Location), strsplit, ":", fixed=TRUE)
chr <- unlist(lapply(positionSplit, function(x) x[[1]][1]))
coord <- unlist(lapply(positionSplit, function(x) x[[1]][2]))
coord <- lapply(coord, strsplit, "-", fixed=TRUE)
start <- as.numeric(unlist(lapply(coord, function(x) x[[1]][1])))
stop <- as.numeric(unlist(lapply(coord, function(x) x[[1]][2])))
stop[is.na(stop)] <- start[is.na(stop)]
# combine everything into one GRanges object
variantGR <- GenomicRanges::GRanges(seqnames=chr, IRanges::IRanges(start=start, end=stop))
variantGR$sample <- sample$sample
variantGR$variantAllele <- toupper(variantAllele$Allele)
# check that the reference chromosomes match the input and BSgenome
seqMismatch <- unique(chr[!chr %in% seqnames(BSgenome)])
if(length(seqMismatch >= 1)){
memo <- paste("The following chromosomes do not match the BSgenome specified:", toString(seqMismatch))
warning(memo)
if(length(unique(chr[!paste0("chr", chr) %in% seqnames(BSgenome)])) == 0){
memo <- paste("appending \"chr\" to chromosomes to fix mismatch with the BSgenome")
warning(memo)
chr <- paste0("chr", chr)
GenomeInfoDb::seqlevels(variantGR) <- unique(chr)
GenomeInfoDb::seqnames(variantGR)[seq_along(variantGR)] <- chr
} else {
memo <- paste("removing entries with chromosomes not matching the BSgenome")
warning(memo)
variantGR_origSize <- length(variantGR)
variantGR <- variantGR[as.character(seqnames(variantGR)) %in% as.character(seqnames(BSgenome)),]
if(verbose){
memo <- paste("removed", variantGR_origSize - length(variantGR), "entries where chromosomes did",
"not match the BSgenome")
}
}
}
if(length(variantGR) == 0){
memo <- paste("There are no variants left after subsets.")
stop(memo)
}
# get the reference sequences
if(verbose){
memo <- paste("Annotating reference bases")
message(memo)
}
refAllele <- toupper(Rsamtools::getSeq(BSgenome, variantGR, as.character=TRUE))
# combine all columns into a consistent format
variantGR$refAllele <- refAllele
lolliplotFormat <- data.table::as.data.table(variantGR)
keep <- c("sample", "seqnames", "start", "end", "refAllele", "variantAllele")
lolliplotFormat <- lolliplotFormat[,keep, with=FALSE]
consequence <- object@vepObject@mutation[,"Consequence"]
gene <- object@vepObject@meta[,"Gene"]
lolliplotFormat <- cbind(lolliplotFormat, gene, consequence)
colnames(lolliplotFormat) <- c("sample", "chromosome", "start", "stop", "reference", "variant", "gene", "consequence")
# if amino acid position is available grab this as well so we don't rely on biomaRt to annotate
HGVSp <- object@vepObject@meta$HGCSp
label <- HGVSp
transcript <- object@vepObject@meta$Feature
valueCheck <- c(is.null(transcript), is.null(HGVSp))
valueCheckvalue <- c("transcript", "HGVSp")
if(any(valueCheck)){
missingIndex <- which(valueCheck == TRUE)
memo <- paste("Could not find the follwing required fields:", toString(valueCheckvalue[missingIndex]),
"unable to use file supplied protein positions, please contact developer to report error!")
warning(memo)
} else {
# combine everything
lolliplotFormat <- cbind(lolliplotFormat, transcript, HGVSp, label)
colnames(lolliplotFormat) <- c("sample", "chromosome", "start", "stop", "reference", "variant", "gene", "consequence", "transcript", "proteinCoord", "label")
# remove the pre-pended ensembl transcript from the label
lolliplotFormat$label <- gsub(".*(p\\..*)", "\\1", lolliplotFormat$label)
lolliplotFormat$proteinCoord <- gsub(".*(p\\..*)", "\\1", lolliplotFormat$proteinCoord)
# remove entries with NA in proteinCoord (these should be just intronic variants, non-transcript features, etc.)
remove <- !is.na(lolliplotFormat$proteinCoord)
lolliplotFormat <- lolliplotFormat[remove,]
}
# return the object
return(lolliplotFormat)
})
#' @rdname retrieve_BSgenome-methods
#' @aliases retrieve_BSgenome
#' @param object Object of class VEP
#' @param BSgenome Object of class BSgenome, used to extract reference bases if
#' not supplied by the file format.
#' @param verbose Boolean specifying if status messages should be reported
#' @importFrom Rsamtools getSeq
#' @importFrom IRanges IRanges
#' @importFrom GenomicRanges GRanges
#' @importFrom BSgenome available.genomes
#' @importFrom BSgenome installed.genomes
#' @importFrom data.table as.data.table
#' @importFrom GenomeInfoDb seqlevels
#' @importFrom GenomeInfoDb seqnames
#' @noRd
setMethod(f="retrieve_BSgenome",
signature="VEP",
definition=function(object, verbose, ...){
# grab the BSgenome
if(verbose){
memo <- paste("Looking for correct genome for reference base annotation.")
message(memo)
}
# look for assembly version in header
header <- object@vepObject@header
header <- header$Info[grepl("assembly", header$Info)]
header <- regmatches(header,regexpr("\\w+(\\d)+", header))
if(length(header) != 1){
memo <- paste("Unable to infer assembly from VEP header,",
"please use the BSgenome parameter!")
stop(memo)
}
# determine if a genome is available
availableGenomes <- BSgenome::available.genomes()
availableGenomes <- availableGenomes[grepl(header, availableGenomes)]
if(length(availableGenomes) == 0){
memo <- paste("Could not find a compatible BSgenome for", toString(header),
"Please specify the bioconductor BSgenome to annotate references bases!")
stop(memo)
}
# determine if the available genome in an installed package
installedGenomes <- BSgenome::installed.genomes()
installedGenomes <- installedGenomes[installedGenomes == availableGenomes]
if(length(installedGenomes) == 0){
memo <- paste("The BSgenome", toString(availableGenomes), "is available",
"but is not installed! Please install", toString(availableGenomes),
"via bioconductor!")
stop(memo)
}
# grab the genome
BSgenome <- installedGenomes[1]
if(verbose){
memo <- paste("attempting to use", toString(BSgenome), "to annotate reference bases!")
}
requireNamespace(BSgenome)
BSgenome <- getExportedValue(BSgenome, BSgenome)
return(BSgenome)
})
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.