#' Converts annovar annotations into MAF.
#'
#' @description Converts variant annotations from Annovar into a basic MAF.
#' @details Annovar is one of the most widely used Variant Annotation tools in Genomics. Annovar output is generally in a tabular format with various annotation columns.
#' This function converts such annovar output files into MAF. This function requires that annovar was run with gene based annotation as a first operation, before including
#' any filter or region based annotations. Please be aware that this function performs no transcript prioritization.
#'
#' e.g,
#' table_annovar.pl example/ex1.avinput humandb/ -buildver hg19 -out myanno -remove -protocol (\code{refGene}),cytoBand,dbnsfp30a -operation (\code{g}),r,f -nastring NA
#'
#' This function mainly uses gene based annotations for processing, rest of the annotation columns from input file will be attached to the end of the resulting MAF.
#' @param annovar input annovar annotation file. Can be vector of multiple files.
#' @param Center Center field in MAF file will be filled with this value. Default NA.
#' @param refBuild NCBI_Build field in MAF file will be filled with this value. Default hg19.
#' @param tsbCol column name containing Tumor_Sample_Barcode or sample names in input file.
#' @param table reference table used for gene-based annotations. Can be 'ensGene' or 'refGene'. Default 'refGene'
#' @param ens2hugo If `table` is `ensGene`, setting this argument to `TRUE` converts all ensemble IDs to hugo symbols.
#' @param basename If provided writes resulting MAF file to an output file.
#' @param sep field seperator for input file. Default tab seperated.
#' @param MAFobj If TRUE, returns results as an \code{\link{MAF}} object.
#' @param sampleAnno annotations associated with each sample/Tumor_Sample_Barcode in input annovar file. If provided it will be included in MAF object. Could be a text file or a data.frame. Ideally annotation would contain clinical data, survival information and other necessary features associated with samples. Default NULL.
#' @references Wang, K., Li, M. & Hakonarson, H. ANNOVAR: functional annotation of genetic variants from high-throughput sequencing data. Nucleic Acids Res 38, e164 (2010).
#' @return MAF table.
#' @examples
#' var.annovar <- system.file("extdata", "variants.hg19_multianno.txt", package = "maftools")
#' var.annovar.maf <- annovarToMaf(annovar = var.annovar, Center = 'CSI-NUS', refBuild = 'hg19',
#' tsbCol = 'Tumor_Sample_Barcode', table = 'ensGene')
#' @export
annovarToMaf = function(annovar, Center = NULL, refBuild = 'hg19', tsbCol = NULL, table = 'refGene', ens2hugo = TRUE, basename = NULL , sep = '\t', MAFobj = FALSE, sampleAnno = NULL){
start_time = proc.time()
cat("-Reading annovar files\n")
ann = lapply(annovar, data.table::fread, colClasses = 'character', sep = sep, stringsAsFactors = FALSE,
fill = TRUE, header=TRUE, skip = "Chr")
names(ann) = unlist(data.table::tstrsplit(x = basename(annovar), split = "\\.", keep = 1))
ann = data.table::rbindlist(l = ann, fill = TRUE, idcol = "sample_id", use.names = TRUE)
#Check to see if input file contains sample names
if(is.null(tsbCol)){
if(! 'Tumor_Sample_Barcode' %in% colnames(ann)){
colnames(ann)[which(colnames(ann) == "sample_id")] = 'Tumor_Sample_Barcode'
cat("--Tumor_Sample_Barcode column not found. Creating sample IDs from filenames\n")
}
}else{
colnames(ann)[which(colnames(ann) == tsbCol)] = 'Tumor_Sample_Barcode'
}
#Table options. See here: http://annovar.openbioinformatics.org/en/latest/user-guide/download/ (not considering UCSC known genes options for now)
table = match.arg(arg = table, choices = c('refGene', 'ensGene'))
if(table == 'ensGene'){
colnames(ann)[which(colnames(ann) == 'Func.ensGene')] = 'Func.refGene'
colnames(ann)[which(colnames(ann) == 'Gene.ensGene')] = 'Gene.refGene'
colnames(ann)[which(colnames(ann) == 'ExonicFunc.ensGene')] = 'ExonicFunc.refGene'
colnames(ann)[which(colnames(ann) == 'AAChange.ensGene')] = 'AAChange.refGene'
colnames(ann)[which(colnames(ann) == 'GeneDetail.ensGene')] = 'GeneDetail.refGene'
}
essential.col = c('Chr', 'Start', 'End', 'Ref', 'Alt', 'Func.refGene', 'Gene.refGene', 'GeneDetail.refGene',
'ExonicFunc.refGene', 'AAChange.refGene')
#Change column names to standard names;
for(i in 1:length(essential.col)){
colId = suppressWarnings(grep(pattern = paste0('^', essential.col[i], '$'), x = colnames(ann), ignore.case = TRUE))
if(length(colId) == 1){
colnames(ann)[colId] = essential.col[i]
}
}
if(length(essential.col[!essential.col %in% colnames(ann)]) > 0) {
message('Available fields:')
print(colnames(ann))
message(paste0('Missing required field in input file: '))
print(essential.col[!essential.col %in% colnames(ann)])
stop()
}
#Center
if(is.null(Center)){
Center = NA
}
#If multiple genes are assigned, used the first entry (which correpsonds to closest gene)
ann[, Hugo_Symbol := unlist(data.table::tstrsplit(Gene.refGene, split = ";", keep = 1))]
#Annovar to MAF mappings (http://annovar.openbioinformatics.org/en/latest/user-guide/gene/)
annovar_values = c(
exonic = "RNA",
splicing = "Splice_Site",
UTR5 = "5'UTR",
UTR3 = "3'UTR",
intronic = "Intron",
upstream = "5'Flank",
downstream = "3'Flank",
intergenic = "IGR",
`frameshift insertion` = "Frame_Shift_Ins",
`frameshift deletion` = "Frame_Shift_Del",
`frameshift block substitution` = "Frameshift_INDEL",
`frameshift substitution` = "Frameshift_INDEL",
stopgain = "Nonsense_Mutation",
stoploss = "Nonstop_Mutation",
startloss = "Translation_Start_Site",
startgain = "Unknown", #Can not properly map MAF classification. Setting it to Unknown
`nonframeshift insertion` = "In_Frame_Ins",
`nonframeshift deletion` = "In_Frame_Del",
`nonframeshift block substitution` = "Inframe_INDEL",
`nonframeshift substitution` = "Inframe_INDEL",
`nonsynonymous SNV` = "Missense_Mutation",
`synonymous SNV` = "Silent",
unknown = "Unknown",
ncRNA_exonic = "RNA",
ncRNA_intronic = "RNA",
ncRNA_UTR3 = "RNA",
ncRNA_UTR5 = "RNA",
ncRNA = "RNA",
ncRNA_splicing = "RNA"
)
#Choose first entry for mixed annotations (e.g; exonic;splicing)
ann[, Func.refGene := unlist(data.table::tstrsplit(x = as.character(Func.refGene), split = ";", keep = 1))]
ann[, ExonicFunc.refGene := unlist(data.table::tstrsplit(x = as.character(ExonicFunc.refGene), split = ";", keep = 1))]
ann$Variant_Classification = ifelse(is.na(ann$ExonicFunc.refGene), annovar_values[ann$Func.refGene], annovar_values[ann$ExonicFunc.refGene])
if(nrow(ann[!is.na(AAChange.refGene)]) > 0){
cat("--Extracting tx, exon, txchange and aa-change\n")
ann_change = ann[!is.na(AAChange.refGene)]
aa_change = unlist(data.table::tstrsplit(x = as.character(ann_change$AAChange.refGene),split = ',', fixed = TRUE, keep = 1))
aa_tbl = lapply(aa_change, function(x){
x = unlist(strsplit(x = x, split = ":", fixed = TRUE))
if(length(x) == 5){
#column contains the gene name, the transcript identifier, exon, tx-change, aa-change
#If these entries are missing, fill them with NAs
tx = x[2]
exon = x[3]
txChange = x[4]
aaChange = x[5]
}else{
tx = NA
exon = NA
txChange = NA
aaChange = NA
}
data.table::data.table(tx, exon, txChange, aaChange)
})
aa_tbl = data.table::rbindlist(l = aa_tbl)
if(length(aa_change) != nrow(ann_change)){
stop("Something went wrong while parsing aa-change")
}
ann_change = cbind(ann_change, aa_tbl)
ann = data.table::rbindlist(l = list(ann_change, ann[is.na(AAChange.refGene)]), use.names = TRUE, fill = TRUE)
}
#Add Variant-type annotations based on difference between ref and alt alleles
cat("-Adding Variant_Type\n")
ann$Variant_Type = ifelse(endsWith(x = ann$Variant_Classification, suffix = "_Del"), yes = "DEL", no = NA)
ann$Variant_Type = ifelse(endsWith(x = ann$Variant_Classification, suffix = "_Ins"), yes = "INS", no = ann$Variant_Type)
ann$Variant_Type = ifelse(ann$Variant_Classification %in% c("Missense_Mutation", "Silent") , yes = "SNP", no = ann$Variant_Type)
ann[,ref_alt_len := nchar(Ref) + nchar(Alt)]
ann[,ref_alt_diff := nchar(Ref) - nchar(Alt)]
ann$Variant_Type = ifelse(ann$ref_alt_diff < 0 , yes = "INS", no = ann$Variant_Type)
ann$Variant_Type = ifelse(ann$ref_alt_diff > 0 , yes = "DEL", no = ann$Variant_Type)
#For ambiguous variants such as DNPs, TNPs and ONPs
if(nrow(ann[is.na(Variant_Type)]) > 0){
ann[, ref_alt := paste(Ref, Alt, sep = ">")]
ann_na = ann[is.na(Variant_Type)]
ann_na$Variant_Type = ifelse(test = ann_na$ref_alt %in% paste("-", c("A", "T", "G", "C"), sep = ">"), yes = "INS",
no = ifelse(test = ann_na$ref_alt %in% paste(c("A", "T", "G", "C"), "-", sep = ">"), yes = "DEL",
no = ifelse(ann_na$ref_alt_len == 2, yes = "SNP", no = ifelse(test = ann_na$ref_alt_len == 4, yes = "DNP",
no = "ONP"))))
ann[, ref_alt := NULL]
ann = data.table::rbindlist(l = list(ann[!is.na(Variant_Type)], ann_na), use.names = TRUE, fill = TRUE)
}
#Annovar ensGene doesn't provide HGNC gene symbols as Hugo_Symbol. We will change them manually.
if(table == 'ensGene'){
if(ens2hugo){
ens = system.file('extdata', 'ensGenes.txt.gz', package = 'maftools')
cat('-Converting Ensemble Gene IDs into HGNC gene symbols\n')
ens = data.table::fread(file = ens, sep = '\t', stringsAsFactors = FALSE)
ann = merge(ann, ens, by.x = 'Hugo_Symbol', by.y = 'ens_id', all.x = TRUE)
ann[,ens_id := Hugo_Symbol] #Backup original ids
ann[,Hugo_Symbol := hgnc_symbol] #Add GHNC gene names
ann[,Entrez_Gene_Id := Entrez] #Add entrez identifiers.
cat('--Done. Original ensemble gene IDs are preserved under field name ens_id\n')
}
}
ann[, ref_alt_diff := NULL]
ann[, ref_alt_len := NULL]
#MAF requires Hugo_Symbols to be set to Unknown in case of IGR mutations
ann$Hugo_Symbol = ifelse(ann$Variant_Classification == "IGR", yes = "Unknown", no = ann$Hugo_Symbol)
ann$Hugo_Symbol = ifelse(is.na(ann$Hugo_Symbol), yes = "Unknown", no = ann$Hugo_Symbol)
#Re-roganize columns
colnames(ann)[match(c("Chr", "Start", "End", "Ref", "Alt"), colnames(ann))] = c("Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2")
ord1 = colnames(x = ann)[colnames(x = ann) %in% c("Hugo_Symbol", "Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode", "tx", "exon", "txChange", "aaChange")]
ord2 = colnames(x = ann)[!colnames(x = ann) %in% c("Hugo_Symbol", "Chromosome", "Start_Position", "End_Position", "Reference_Allele", "Tumor_Seq_Allele2", "Variant_Classification", "Variant_Type", "Tumor_Sample_Barcode", "tx", "exon", "txChange", "aaChange")]
ann = ann[,c(ord1, ord2), with = FALSE]
if(!is.null(basename)){
data.table::fwrite(x = ann, file = paste(basename, 'maf', sep='.'), sep='\t')
}
cat("Finished in",data.table::timetaken(start_time),"\n")
if(MAFobj){
m = read.maf(maf = ann, clinicalData = sampleAnno, verbose = FALSE)
return(m)
}else{
return(ann)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.