#' Converts ICGC Simple Somatic Mutation format file to MAF
#'
#' @description Converts ICGC Simple Somatic Mutation format file to Mutation Annotation Format. Basic fields are converted as per MAF specififcations, rest of the fields are retained as in the input file.
#' Ensemble gene IDs are converted to HGNC Symbols. Note that by default Simple Somatic Mutation format contains all affected transcripts of a variant resuting in
#' multiple entries of the same variant in same sample. It is hard to choose a single affected transcript based on annotations alone and by default this program removes repeated variants as duplicated entries.
#' If you wish to keep all of them, set removeDuplicatedVariants to FALSE.
#' @details ICGC Simple Somatic Mutattion format specififcation can be found here: http://docs.icgc.org/submission/guide/icgc-simple-somatic-mutation-format/
#'
#' @param icgc Input data in ICGC Simple Somatic Mutation format. Can be gz compressed.
#' @param basename If given writes to output file with basename.
#' @param MAFobj If TRUE returns results as an \code{\link{MAF}} object.
#' @param clinicalData Clinical data associated with each sample/Tumor_Sample_Barcode in MAF. Could be a text file or a data.frame. Default NULL.
#' @param removeDuplicatedVariants removes repeated variants in a particuar sample, mapped to multiple transcripts of same Gene. See Description. Default TRUE.
#' @param addHugoSymbol If TRUE replaces ensemble gene IDs with Hugo_Symbols. Default FALSE.
#' @return tab delimited MAF file.
#' @examples
#' esca.icgc <- system.file("extdata", "simple_somatic_mutation.open.ESCA-CN.sample.tsv.gz", package = "maftools")
#' esca.maf <- icgcSimpleMutationToMAF(icgc = esca.icgc)
#' @export
icgcSimpleMutationToMAF = function(icgc, basename = NA, MAFobj = FALSE, clinicalData = NULL, removeDuplicatedVariants = TRUE, addHugoSymbol = FALSE){
icgc = data.table::fread(file = icgc, sep = '\t', stringsAsFactors = FALSE, header = TRUE, showProgress = TRUE, data.table = TRUE, verbose = FALSE, fill = TRUE)
icgc.indels = icgc[consequence_type %in% 'frameshift_variant']
icgc = icgc[!consequence_type %in% 'frameshift_variant']
#Variant consequences
vep.vc = c("splice_acceptor_variant", "splice_donor_variant", "transcript_ablation",
"exon_loss_variant", "stop_gained", "stop_lost", "initiator_codon_variant",
"start_lost", "missense_variant", "coding_sequence_variant",
"conservative_missense_variant", "rare_amino_acid_variant", "transcript_amplification",
"intron_variant", "INTRAGENIC", "intragenic_variant", "splice_region_variant",
"incomplete_terminal_codon_variant", "synonymous_variant", "stop_retained_variant",
"NMD_transcript_variant", "mature_miRNA_variant", "exon_variant",
"non_coding_exon_variant", "non_coding_transcript_exon_variant",
"non_coding_transcript_variant", "nc_transcript_variant", "5_prime_UTR_variant",
"5_prime_UTR_premature_start_codon_gain_variant", "3_prime_UTR_variant",
"TF_binding_site_variant", "regulatory_region_variant", "regulatory_region",
"intergenic_variant", "intergenic_region", "upstream_gene_variant",
"downstream_gene_variant", "disruptive_inframe_deletion", "inframe_deletion", "inframe_insertion", "disruptive_inframe_insertion")
#Corresponding MAF Variant Classifications
maf.vc = c("Splice_Site", "Splice_Site", "Splice_Site", "Splice_Site",
"Nonsense_Mutation", "Nonstop_Mutation", "Translation_Start_Site",
"Translation_Start_Site", "Missense_Mutation", "Missense_Mutation",
"Missense_Mutation", "Missense_Mutation", "Intron", "Intron",
"Intron", "Intron", "Splice_Region", "Silent", "Silent", "Silent",
"Silent", "RNA", "RNA", "RNA", "RNA", "RNA", "RNA", "5'UTR",
"5'UTR", "3'UTR", "IGR", "IGR", "IGR", "IGR", "IGR", "5'Flank",
"3'Flank", "In_Frame_Del", "In_Frame_Del", "In_Frame_Ins", "In_Frame_Ins")
names(maf.vc) = vep.vc
#Add MAF Variant_Classification levels correspoding to VEP consequences
icgc$Variant_Classification = maf.vc[as.character(icgc$consequence_type)]
#icgc$Variant_Classification = suppressWarnings(as.character(factor(x = icgc$consequence_type, levels = vep.vc, labels = maf.vc)))
#Add MAF Variant_Classification levels of INDELS correspoding to VEP consequences
icgc.indels$Variant_Classification = ifelse(test = icgc.indels$consequence_type %in% "frameshift_variant" & icgc.indels$mutation_type %in% "insertion of <=200bp", yes = "Frame_Shift_Ins", no = "Frame_Shift_Del")
#bind everything
icgc = rbind(icgc, icgc.indels)
#Add MAF Variant_Type to correspoding VEP mutation_type
icgc$Variant_Type = suppressWarnings(as.character(factor(x = icgc$mutation_type, levels = c("deletion of <=200bp", "insertion of <=200bp", "single base substitution"),
labels = c("DEL", "INS", "SNP"))))
icgc.fields = colnames(icgc)
required.fields = c("gene_affected", "assembly_version", "chromosome", "chromosome_start", "chromosome_end", "chromosome_strand", "Variant_Classification", "Variant_Type",
"reference_genome_allele", "mutated_from_allele", "mutated_to_allele", "icgc_sample_id", "verification_status", "biological_validation_status", "sequencing_strategy",
"verification_platform")
icgc.rest.fields = icgc.fields[!icgc.fields %in% required.fields]
icgc.rest = icgc[,icgc.rest.fields, with = FALSE]
#Make MAF table
icgc.maf = data.table::data.table(Hugo_Symbol = icgc[,gene_affected], Entrez_Gene_Id = NA, Center = NA, NCBI_Build = icgc[,assembly_version], Chromosome = icgc[,chromosome],
Start_Position = icgc[,chromosome_start], End_Position = icgc[,chromosome_end], Strand = '+',
Variant_Classification = icgc[,Variant_Classification], Variant_Type = icgc[,Variant_Type], Reference_Allele = icgc[,reference_genome_allele],
Tumor_Seq_Allele1 = icgc[,mutated_from_allele], Tumor_Seq_Allele2 = icgc[,mutated_to_allele], dbSNP_RS = NA, dbSNP_Val_Status = NA,
Tumor_Sample_Barcode = icgc[,icgc_sample_id], Verification_Status = icgc[,verification_status], Mutation_Status = "Somatic",
Sequence_Source = icgc[,sequencing_strategy], Validation_Method = icgc[,verification_platform])
icgc.maf$Verification_Status = ifelse(test = icgc.maf$Verification_Status %in% "not tested", yes = 'Unknown', no = 'Verified')
#Attach rest of fields
icgc.maf = cbind(icgc.maf, icgc.rest)
if(addHugoSymbol){
#Change ensemble gene IDs into Hugo_Symbol
ens = system.file('extdata', 'ensGenes.txt.gz', package = 'maftools')
message('Converting Ensemble Gene IDs into HGNC gene symbols.')
ens = data.table::fread(file = ens, sep = '\t', stringsAsFactors = FALSE)
icgc.maf[,ens_id := Hugo_Symbol]
icgc.maf = merge(icgc.maf, ens, by.x = 'Hugo_Symbol', by.y = 'ens_id', all.x = TRUE)
icgc.maf[,Hugo_Symbol := hgnc_symbol]
icgc.maf[,Entrez_Gene_Id := Entrez]
icgc.maf[,hgnc_symbol := NULL]
icgc.maf[,Entrez := NULL]
message('Done! Original ensemble gene IDs are preserved under field name ens_id')
}
#Order according to Hugo_Symbol
icgc.maf = icgc.maf[order(Hugo_Symbol)]
#Validate
icgc.maf = validateMaf(maf = icgc.maf, isTCGA = FALSE, rdup = removeDuplicatedVariants)
if(!is.na(basename)){
write.table(x = icgc.maf, file = paste(basename, 'maf', sep='.'), quote = FALSE, row.names = FALSE, sep = '\t')
}
if(MAFobj){
if(length(unique(icgc.maf[,Tumor_Sample_Barcode])) < 2){
message('Too few samples to create MAF object. Returning MAF table.')
return(icgc.maf)
}else{
#Convert to factors.
icgc.maf$Tumor_Sample_Barcode = as.factor(x = as.character(icgc.maf$Tumor_Sample_Barcode))
icgc.maf$Variant_Classification = as.factor(x = as.character(icgc.maf$Variant_Classification))
icgc.maf$Variant_Type = as.factor(x = as.character(icgc.maf$Variant_Type))
message('Summarizing..')
silent = c("3'UTR", "5'UTR", "3'Flank", "Targeted_Region", "Silent", "Intron",
"RNA", "IGR", "Splice_Region", "5'Flank", "lincRNA", "De_novo_Start_InFrame", "De_novo_Start_OutOfFrame", "Start_Codon_Ins", "Start_Codon_SNP", "Stop_Codon_Del")
maf.silent = icgc.maf[Variant_Classification %in% silent]
icgc.maf = icgc.maf[!Variant_Classification %in% silent]
maf.summary = summarizeMaf(maf = icgc.maf, anno = clinicalData)
icgc.maf = MAF(nonSyn = icgc.maf, syn = maf.silent, clinicalData = clinicalData, verbose = TRUE)
# icgc.maf = MAF(data = icgc.maf, variants.per.sample = maf.summary$variants.per.sample, variant.type.summary = maf.summary$variant.type.summary,
# variant.classification.summary = maf.summary$variant.classification.summary, gene.summary = maf.summary$gene.summary,
# summary = maf.summary$summary, maf.silent = maf.silent, clinical.data = maf.summary$sample.anno)
}
}
return(icgc.maf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.