#' vcf annotation with MORFEE
#'
#' @param myvcf_annot an ANNOVAR annotated VCF object to annotate with MORFEE
#' @param morfee_data data object obtained from get.morfee.data()
#'
#' @importFrom stats na.omit
#' @import foreach
#' @import VariantAnnotation
#' @import Biostrings
#' @import GenomicRanges
#'
#' @export
#'
morfee.annotation <- function(myvcf_annot, morfee_data){
myvcf_annot_info <- info(myvcf_annot)
myvcf_annot_info$MORFEE_uATG <- NA
myvcf_annot_info$MORFEE_uSTOP <- NA
myvcf_annot_header <- rbind(info(header(myvcf_annot)),
data.frame(Number = ".", Type = "String", Description = "New ATG annotation provided by MORFEE", stringsAsFactors = FALSE),
data.frame(Number = ".", Type = "String", Description = "Deletion STOP annotation provided by MORFEE", stringsAsFactors = FALSE) )
rownames(myvcf_annot_header) <- c(rownames(info(header(myvcf_annot))),"MORFEE_uATG", "MORFEE_uSTOP")
info(header(myvcf_annot)) <- myvcf_annot_header
info(myvcf_annot) <- myvcf_annot_info
i <- NULL
for(i in 1:nrow(myvcf_annot_info)){
my_func <- as.character(myvcf_annot_info[i,"Func.refGene"])
if(my_func!="UTR5"){
message(paste0("Skip variant ",i,": not in UTR5 region"))
next
}
my_refgene <- as.character(myvcf_annot_info[i,"GeneDetail.refGene"])
if(my_refgene=="."){
message(paste0("Skip variant ",i,": has no GeneDetail.refGene annotation"))
next
}
my_gene <- as.character(myvcf_annot_info[i,"Gene.refGene"])
my_nm_list <- parse_GeneDetail.refGene(my_refgene)
my_snp_pos_geno <- start(ranges(rowRanges(myvcf_annot)))[i]
my_chr <- as.character(seqnames(rowRanges(myvcf_annot))[i])
if(length(grep("chr",my_chr))<1){
my_chr <- paste0("chr",my_chr)
}
my_gencode_seque <- morfee_data[["GENCODE_SEQ"]][which(morfee_data[["GENCODE_SEQ_ORDER"]]==my_chr)]
# Loop for each transcript (row)
for(nm in 1:nrow(my_nm_list)){
if(!(as.character(my_nm_list[nm,5]) %in% c("A","T","C","G"))){
message(paste0("Skip variant ",i,": reference allele not A, T, C or G! ",my_nm_list[nm,5]))
# message(" it could be an indel, but not yet supported!")
next
}
if(!(as.character(my_nm_list[nm,6]) %in% c("A","T","C","G"))){
message(paste0("Skip variant ",i,": alternative allele not A, T, C or G! ",my_nm_list[nm,6]))
# message(" it could be an indel, but not yet supported!")
next
}
my_nm <- my_nm_list[nm,1]
my_snp <- my_nm_list[nm,2]
my_upordown <- my_nm_list[nm,3]
my_snp_pos_rel <- as.numeric(my_nm_list[nm,4]) # Position from ATG, ex: 94
my_nm_id <- grep(paste0(my_nm,"\\."), morfee_data[["GENCODE_METAD"]]$V2)
if(length(my_nm_id)==0){
message(paste0("Skip variant ",i,": annotation for ",my_nm_id," was not found in GENCODE"))
next
}
my_enst <- morfee_data[["GENCODE_METAD"]][my_nm_id,1][1]
if(morfee_data[["GRCh"]]==37){
my_transcript_id <- grep(paste0(my_enst,"_"), morfee_data[["GENCODE_ANNOT"]]$transcript_id)
}else if(morfee_data[["GRCh"]]==38){
my_transcript_id <- grep(my_enst, morfee_data[["GENCODE_ANNOT"]]$transcript_id)
}else{
stop("Reference Genome unknown")
}
gencode_annot_sub <- morfee_data[["GENCODE_ANNOT"]][my_transcript_id,]
gencode_annot_cds <- gencode_annot_sub[gencode_annot_sub$type=="CDS",]
gencode_annot_exon <- gencode_annot_sub[gencode_annot_sub$type=="exon",]
gencode_annot_transcript_type <- unique(gencode_annot_exon$transcript_type)
if(length(gencode_annot_transcript_type)<1){
message(paste0("Skip variant ",i,": transcript type is not protein coding"))
next
}else if(!(gencode_annot_transcript_type %in% "protein_coding")){
message(paste0("Skip variant ",i,": transcript type is not protein coding"))
next
}
my_init_codon_r <- gencode_annot_sub[gencode_annot_sub$type=="start_codon",]
if(nrow(my_init_codon_r)==0){
message(paste0("Skip variant ",i,": associated transcript without start_codon"))
next
}
my_init_codon_start <- as.numeric(my_init_codon_r[,"start"])[1] # Several start codon?
my_init_codon_end <- as.numeric(my_init_codon_r[,"end"])[1] # Several start codon?
my_stop_codon_r <- gencode_annot_sub[gencode_annot_sub$type=="stop_codon",]
if(nrow(my_stop_codon_r)==0){
message(paste0("Skip variant ",i,": associated transcript without stop_codon"))
next
}
my_stop_codon_start <- as.numeric(my_stop_codon_r[,"start"])[1] # Several stop codon?
my_stop_codon_end <- as.numeric(my_stop_codon_r[,"end"])[1] # Several stop codon?
if(my_init_codon_end < my_stop_codon_end){
my_init_codon_5 <- my_init_codon_start
my_init_codon_3 <- my_init_codon_end
my_stop_codon_5 <- my_stop_codon_start
my_stop_codon_3 <- my_stop_codon_end
}else{
my_init_codon_5 <- my_init_codon_end
my_init_codon_3 <- my_init_codon_start
my_stop_codon_5 <- my_stop_codon_end
my_stop_codon_3 <- my_stop_codon_start
}
########################################
# Test gene orientation
if(my_init_codon_end < my_stop_codon_end){
gencode_annot_exon <- gencode_annot_exon[order(as.numeric(gencode_annot_exon$exon_number), decreasing = FALSE),]
exons_length <- (gencode_annot_exon[,"end"]+1)-gencode_annot_exon[,"start"]
my_snp_exon <- which(gencode_annot_exon[,"start"] <= my_snp_pos_geno & gencode_annot_exon[,"end"] >= my_snp_pos_geno)
my_start_exon <- which(gencode_annot_exon[,"start"] <= my_init_codon_5 & gencode_annot_exon[,"end"] >= my_init_codon_5)
my_cdna_length_A <- my_cdna_length_B <- 0
if(length(my_snp_exon)==0){
message(paste0("Skip variant ",i,": not in an exonic part"))
next
}
# Reference sequence stats
for(exon_i in 1:nrow(gencode_annot_exon)){
if(exon_i==1){
my_cdna <- subseq(my_gencode_seque, start=gencode_annot_exon[exon_i,"start"], end=gencode_annot_exon[exon_i,"end"])[[1]]
}else{
my_cdna_i <- subseq(my_gencode_seque, start=gencode_annot_exon[exon_i,"start"], end=gencode_annot_exon[exon_i,"end"])[[1]]
my_cdna <- c(my_cdna, my_cdna_i)
}
# Calcul position of variant in cDNA
if(exon_i < my_snp_exon){
my_cdna_length_A <- my_cdna_length_A + exons_length[exon_i]
}else if(exon_i == my_snp_exon){
my_snp_pos_cdna <- my_cdna_length_A + ((my_snp_pos_geno+1) - gencode_annot_exon[exon_i,"start"])
}
# Calcul position of reference A(TG) in cDNA
if(exon_i < my_start_exon){
my_cdna_length_B <- my_cdna_length_B + exons_length[exon_i]
}else if(exon_i == my_start_exon){
my_init_codon_5_cdna <- my_cdna_length_B + ((my_init_codon_5 + 1) - gencode_annot_exon[exon_i,"start"])
}
}
# Find codon start in reference sequence
stats_orig <- matchPattern(morfee_data[["SEQ_INIT"]], my_cdna)
# Found all STOP in reference sequence
for(j in 1:length(morfee_data[["SEQ_STOP"]])){
stats_stop_orig_j <- matchPattern(morfee_data[["SEQ_STOP"]][[j]], my_cdna)
if(j==1){
stats_stop_orig <- stats_stop_orig_j
}else{
stats_stop_orig <- c(stats_stop_orig, stats_stop_orig_j)
}
}
my_ref_allele <- as.character(subseq(my_cdna, start=my_snp_pos_cdna, end=my_snp_pos_cdna))
if(my_ref_allele!=my_nm_list[nm,5]){
message(paste0("Skip variant ",i,": mismatch between alleles"))
next
}
my_ref_ATG <- as.character(subseq(my_cdna, start=my_init_codon_5_cdna, end=(my_init_codon_5_cdna+2)))
if(my_ref_ATG!="ATG"){
message(paste0("Skip variant ",i,": reference ATG no detected"))
next
}
# Replace my sequence with SNP
my_cdna_updated <- replaceLetterAt(my_cdna, my_snp_pos_cdna, my_nm_list[nm,6])
########################################
}else{
# message("Opposite orientation!")
gencode_annot_exon <- gencode_annot_exon[order(as.numeric(gencode_annot_exon$exon_number), decreasing = TRUE),]
exons_length <- (gencode_annot_exon[,"end"]+1)-gencode_annot_exon[,"start"]
my_snp_exon <- which(gencode_annot_exon[,"start"] <= my_snp_pos_geno & gencode_annot_exon[,"end"] >= my_snp_pos_geno)
my_start_exon <- which(gencode_annot_exon[,"start"] <= my_init_codon_5 & gencode_annot_exon[,"end"] >= my_init_codon_5)
my_cdna_length_A <- my_cdna_length_B <- 0
if(length(my_snp_exon)==0){
message(paste0("Skip variant ",i,": not in an exonic part"))
next
}
# Reference sequence stats
for(exon_i in 1:nrow(gencode_annot_exon)){
if(exon_i==1){
my_cdna <- subseq(my_gencode_seque, start=gencode_annot_exon[exon_i,"start"], end=gencode_annot_exon[exon_i,"end"])[[1]]
my_cdna <- reverse(complement(my_cdna))
}else{
my_cdna_i <- subseq(my_gencode_seque, start=gencode_annot_exon[exon_i,"start"], end=gencode_annot_exon[exon_i,"end"])[[1]]
my_cdna_i <- reverse(complement(my_cdna_i))
my_cdna <- c(my_cdna, my_cdna_i)
}
# Calcul position of variant in cDNA
if(exon_i < my_snp_exon){
my_cdna_length_A <- my_cdna_length_A + exons_length[exon_i]
}else if(exon_i == my_snp_exon){
my_snp_pos_cdna <- my_cdna_length_A + (gencode_annot_exon[exon_i,"end"] - (my_snp_pos_geno-1))
}
# Calcul position of reference A(TG) in cDNA
if(exon_i < my_start_exon){
my_cdna_length_B <- my_cdna_length_B + exons_length[exon_i]
}else if(exon_i == my_start_exon){
my_init_codon_5_cdna <- my_cdna_length_B + (gencode_annot_exon[exon_i,"end"] - (my_init_codon_3+1))
}
}
# Find codon start in reference sequence
stats_orig <- matchPattern(morfee_data[["SEQ_INIT"]], my_cdna)
# Found all STOP in reference sequence
for(j in 1:length(morfee_data[["SEQ_STOP"]])){
stats_stop_orig_j <- matchPattern(morfee_data[["SEQ_STOP"]][[j]], my_cdna)
if(j==1){
stats_stop_orig <- stats_stop_orig_j
}else{
stats_stop_orig <- c(stats_stop_orig, stats_stop_orig_j)
}
}
my_ref_allele <- as.character(subseq(my_cdna, start=my_snp_pos_cdna, end=my_snp_pos_cdna))
if(my_ref_allele!=my_nm_list[nm,5]){
message(paste0("Skip variant ",i,": mismatch between alleles"))
next
}
my_ref_ATG <- as.character(subseq(my_cdna, start=my_init_codon_5_cdna, end=(my_init_codon_5_cdna+2)))
if(my_ref_ATG!="ATG"){
message(paste0("Skip variant ",i,": reference ATG no detected"))
next
}
# Replace my sequence with SNP
my_cdna_updated <- replaceLetterAt(my_cdna, my_snp_pos_cdna, my_nm_list[nm,6])
}
########################################
# Find codon start in mutated sequence
stats_mut <- matchPattern(morfee_data[["SEQ_INIT"]], my_cdna_updated)
# Found all STOP in mutated sequence
for(j in 1:length(morfee_data[["SEQ_STOP"]])){
stats_stop_mut_j <- matchPattern(morfee_data[["SEQ_STOP"]][[j]], my_cdna_updated)
if(j==1){
stats_stop_mut <- stats_stop_mut_j
}else{
stats_stop_mut <- c(stats_stop_mut, stats_stop_mut_j)
}
}
# Comparer codon start in reference and mutated sequences
new.atg <- ranges(stats_mut)[!c(ranges(stats_mut) %in% ranges(stats_orig)) ,]
# Compare stop codons in reference and mutated sequences
del.stop <- ranges(stats_stop_orig)[!c(ranges(stats_stop_orig) %in% ranges(stats_stop_mut)) ,]
if(length(new.atg)>0){
message("New ATG detected!")
if(my_init_codon_end < my_stop_codon_end){
my.strand <- "forward"
}else{
my.strand <- "reverse"
}
new.atg.distance <- my_init_codon_5_cdna - (start(new.atg)[1])
test.frame.uatg <- (new.atg.distance%%3)
if(test.frame.uatg==0){
in.frame <- "in_frame"
}else{
in.frame <- paste0("out_of_frame_(",test.frame.uatg,")")
}
# Found all STOP
for(j in 1:length(morfee_data[["SEQ_STOP"]])){
my_stop_j <- matchPattern(morfee_data[["SEQ_STOP"]][[j]], my_cdna_updated[ start(range(new.atg)) : length(my_cdna_updated) ])
if(j==1){
my_stops <- my_stop_j
}else{
my_stops <- c(my_stops, my_stop_j)
}
}
# First stop in phase to the new codon start
my_stops_sort <- sort(start(ranges(my_stops)))
my_first_stop <- my_stops_sort[my_stops_sort%%3==1][1]
# Lenght of proteins
generated.prot.length <- (my_first_stop-1)/3
ref.prot.length <- (sum(gencode_annot_cds[,"end"]+1 - gencode_annot_cds[,"start"] ) -3)/3
if(is.na(my_first_stop)){
message(paste0("Skip variant ",i,": new ATG detected but no STOP in phase"))
next
}
# Determine whether the ORF is overlapping, not overlapping or elongated CD
if(my_first_stop < my_init_codon_5_cdna){
overlapping.prot <- "not_overlapping"
}else if( in.frame=="in_frame" & (my_first_stop > my_init_codon_5_cdna) ){
overlapping.prot <- "elongated_CDS"
}else{
overlapping.perc <- ((my_first_stop-my_init_codon_5_cdna)/(ref.prot.length*3))*100
overlapping.perc.round <- round(overlapping.perc, digits = 2)
overlapping.prot <- paste0("overlapping_",overlapping.perc.round,"%")
}
message( paste("For",my_gene,"-",my_nm,"and",my_snp))
message(paste0(" - New uATG detected at: ",new.atg.distance," from the main ATG!"))
message( paste(" - new uATG is",in.frame,"to the main ATG!"))
message( paste(" - new generated protein has a length of",generated.prot.length,"(aa) vs",ref.prot.length,"(aa)"))
# message(paste0(" - DEBUG: i=",i," ; nm=",nm))
message("\n\n")
# Update myvcf_annot_info
new_field <- paste( na.omit(c( myvcf_annot_info[i,"MORFEE_uATG"],
paste0(my_nm,":",my.strand,",",new.atg.distance,",",in.frame,",",overlapping.prot,",",generated.prot.length,"[/",ref.prot.length,"]","(aa)")) )
, collapse="|")
myvcf_annot_info[i,"MORFEE_uATG"] <- new_field
}# END new ATG
if(length(del.stop)>0){
# Use stats_orig, but could use stats_mut
uatg <- start(stats_orig)[ c(start(del.stop) - start(stats_orig)) > 0]
uatg_in_frame <- uatg[((start(del.stop) - uatg) %% 3)==0]
del.stop.distance <- my_init_codon_5_cdna - (start(del.stop)[1])
if(length(uatg_in_frame)>0){
if(my_init_codon_end < my_stop_codon_end){
my.strand <- "forward"
}else{
my.strand <- "reverse"
}
print( "STOP deletion detected!")
print( " - uSTOP deletion in ORF detected!")
print( paste("For",my_gene,"-",my_nm,"and",my_snp))
print(paste0(" - Deletion of a uSTOP codon detected at: ",-del.stop.distance," from the main ATG!"))
print( paste(" --- " , as.character( my_cdna[start(del.stop)[1]:end(del.stop)[1]] ),
" becomes ",as.character(my_cdna_updated[start(del.stop)[1]:end(del.stop)[1]] ) ))
print( paste(" --- Gene direction:",my.strand))
# print(paste0(" - DEBUG: i=",i," ; nm=",nm))
# several uATG could be present, so the protein length will be different
for(uatg_i in uatg_in_frame){
# uatg_i = uatg_in_frame[1]
# Find next stop in frame with uatg_i
uatg_i_in_frame <- start(stats_stop_mut)[ ((uatg_i - start(stats_stop_mut)) %%3)==0 ]
id_ustop <- uatg_i_in_frame > uatg_i
if(sum(id_ustop)>0){
first_new_stop <- min( uatg_i_in_frame[id_ustop] )
}else{
message(paste0("Skip variant ",i,": uATG detected but no STOP in phase"))
next
}
# Compute distance and length
stop.generated.prot.length <- (first_new_stop-uatg_i)/3
ref.prot.length <- (sum(gencode_annot_cds[,"end"]+1 - gencode_annot_cds[,"start"] ) -3)/3
uatg_used <- -(my_init_codon_5_cdna - uatg_i)
stop_used <- (my_init_codon_5_cdna - 1 - first_new_stop)
if(uatg_used>=0){
message(paste0("Skip variant ",i,": position of uATG is positive! Probably an error in the used reference database"))
next
}
if(stop_used<0){
# 1. uATG in frame to ref ATG
uatg_inframe_refatg <- ((uatg_i-my_init_codon_5_cdna)%%3)==0
# 2. STOP position used > ATG
stop_used_downstream <- (first_new_stop > my_init_codon_5_cdna)
if(uatg_inframe_refatg & stop_used_downstream){
overlapping.prot <- "elongated_CDS"
}else{
overlapping.perc <- (-stop_used/(ref.prot.length*3))*100
overlapping.perc.round <- round(overlapping.perc, digits = 2)
overlapping.prot <- paste0("overlapping_",overlapping.perc.round,"%")
}
}else{
overlapping.prot <- "not_overlapping"
}
stop.codon <- as.character(stats_stop_mut[start(stats_stop_mut)==first_new_stop])
test.frame.ustop <- ((my_init_codon_5_cdna-stop_used)%%3)
if(test.frame.ustop==0){
in.frame <- "in_frame"
}else{
in.frame <- paste0("out_of_frame_(",test.frame.ustop,")")
}
print( " --")
print( paste(" --- using uATG at",uatg_used,"to the main ATG!"))
print(paste0(" --- using STOP (",stop.codon,") at ",-stop_used," to the main ATG!"))
print( paste(" --- new predicted ORF has a length of",stop.generated.prot.length,"(aa) vs",ref.prot.length,"(aa) for the main protein"))
print( paste(" --- new predicted ORF is",overlapping.prot,"with the main protein"))
# Update myvcf_annot_info
new_field <- paste( na.omit(c( myvcf_annot_info[i,"MORFEE_uSTOP"],
paste0(my_nm,":",my.strand,",",-del.stop.distance,",",in.frame,",",overlapping.prot,",",stop.generated.prot.length,"[/",ref.prot.length,"]","(aa)")) )
, collapse="|")
myvcf_annot_info[i,"MORFEE_uSTOP"] <- new_field
}
cat("\n\n")
}else{
message( "STOP deletion detected!")
message( " - uSTOP deletion detected BUT without an upstream ATG (not in an ORF region)!")
message( paste("For",my_gene,"-",my_nm,"and",my_snp))
message(paste0(" - Deletion of a uSTOP codon detected at: ",-del.stop.distance," from the main ATG!"))
message( paste(" --- ", as.character( my_cdna[start(del.stop)[1]:end(del.stop)[1]] ),
" becomes ",as.character(my_cdna_updated[start(del.stop)[1]:end(del.stop)[1]] ) ))
message("\n\n")
}
}# END del STOP
}
}
info(myvcf_annot) <- myvcf_annot_info
return(myvcf_annot)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.