Nothing
#' Make trinucleotides counts matrix from input data for a given reference genome.
#'
#' @examples
#' \dontrun{
#' data(ssm560_reduced)
#' library("BSgenome.Hsapiens.1000genomes.hs37d5")
#' trinucleotides_counts = import.trinucleotides.counts(data=ssm560_reduced,
#' reference=BSgenome.Hsapiens.1000genomes.hs37d5)
#' }
#'
#' @title import.trinucleotides.counts
#' @param data a data.frame with variants having 6 columns: sample name, chromosome, start position, end position, ref, alt.
#' @param reference a BSgenome object with the reference genome to be used to retrieve flanking bases.
#' @return A matrix with trinucleotides counts per patient.
#' @export import.trinucleotides.counts
#' @importFrom data.table data.table dcast .N
#' @importFrom Biostrings DNAStringSet complement reverseComplement subseq
#' @import GenomicRanges
#' @import IRanges
#' @import GenomeInfoDb
#' @importFrom BSgenome getSeq
#'
"import.trinucleotides.counts" <- function( data, reference = NULL ) {
# check that reference is a BSgenome object
if(is.null(reference)|class(reference)!="BSgenome") {
stop("The reference genome provided as input needs to be a BSgenome object.")
}
# preprocessing input data
data <- as.data.frame(data)
colnames(data) <- c("sample","chrom","start","end","ref","alt")
# consider only single nucleotide variants involving (A,C,G,T) bases
data <- data[which(data[,"start"]==data[,"end"]),,drop=FALSE]
data <- data[which(as.matrix(data[,"ref"])%in%c("A","C","G","T")),,drop=FALSE]
data <- data[which(as.matrix(data[,"alt"])%in%c("A","C","G","T")),,drop=FALSE]
data <- data[,c("sample","chrom","start","ref","alt"),drop=FALSE]
colnames(data) <- c("sample","chrom","pos","ref","alt")
data <- unique(data)
data <- data[order(data[,"sample"],data[,"chrom"],data[,"pos"]),,drop=FALSE]
# convert data to GRanges
data <- GRanges(data$chrom,IRanges(start=(data$pos-1),width=3),ref=DNAStringSet(data$ref),alt=DNAStringSet(data$alt),sample=data$sample)
# check that all chromosomes match reference
if(length(setdiff(seqnames(data),GenomeInfoDb::seqnames(reference)))>0) {
warning("Check chromosome names, not all match reference genome.")
}
# find context for each mutation
data$context <- getSeq(reference,data)
# check for any mismatch with BSgenome context
if(any(subseq(data$context,2,2)!=data$ref)) {
warning("Check reference bases, not all match context.")
}
# get complements and reverse complements
data$cref <- complement(data$ref)
data$calt <- complement(data$alt)
data$rccontext <- reverseComplement(data$context)
# identify trinucleotides motif
data$cat <- ifelse(data$ref%in%c("C","T"),paste0(subseq(data$context,1,1),"[",data$ref,">",data$alt,"]",subseq(data$context,3,3)),
paste0(subseq(data$rccontext,1,1),"[",data$cref,">",data$calt,"]",subseq(data$rccontext,3,3)))
# create 96 trinucleotides mutation categories
categories_context <- NULL
categories_alt <- rep(c(rep("C>A",4),rep("C>G",4),rep("C>T",4),rep("T>A",4),rep("T>C",4),rep("T>G",4)),4)
categories_cat <- NULL
cont <- 0
for(i in c("A","C","G","T")) {
for(j in 1:6) {
for(k in c("A","C","G","T")) {
cont <- cont + 1
categories_context <- c(categories_context,paste0(k,":",i))
categories_cat <- c(categories_cat,paste0(k,"[",categories_alt[cont],"]",i))
}
}
}
mutation_categories <- data.table::data.table(context=categories_context,alt=categories_alt,cat=categories_cat)
# count number of mutations per sample for each category
input1 <- data.table::data.table(mutation_categories[,"cat"])
input2 <- data.table::data.table(sample=data$sample,cat=data$cat)
input2 <- input2[,.N,by=.(sample,cat)]
data <- merge(input1,input2,by="cat",all=TRUE)
data <- data.table::dcast(data,sample~cat,value.var="N")
data <- data[!is.na(sample),drop=FALSE]
data[is.na(data)] <- 0
# make trinucleotides counts matrix
samples_names <- data$sample
data <- as.matrix(data[,2:ncol(data),drop=FALSE])
rownames(data) <- samples_names
data <- data[sort(rownames(data)),,drop=FALSE]
data <- data[,sort(colnames(data)),drop=FALSE]
trinucleotides_counts <- array(0,c(nrow(data),96))
rownames(trinucleotides_counts) <- rownames(data)
colnames(trinucleotides_counts) <- sort(as.character(mutation_categories$cat))
rows_contexts <- rownames(data)
cols_contexts <- colnames(trinucleotides_counts)[which(colnames(trinucleotides_counts)%in%colnames(data))]
trinucleotides_counts[rows_contexts,cols_contexts] <- data[rows_contexts,cols_contexts]
# return trinucleotides counts matrix
return(trinucleotides_counts)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.