#' Generates a bedgraph from GRanges object in order to upload on to UCSC Genome browser
#'
#' @param index is used to index the bedfile(s)
#' @param outputfolder Location to store created bedfile(s)
#' @param fragments A \code{\link{GRanges}} object with strand and mapq metadata, such as that generated by \code{\link{bam2GRanges}}
#' @importFrom S4Vectors runValue
#' @author David Porubsky
#' @export
exportBedGraph <- function(index, outputfolder, fragments=NULL, col="200,100,10") {
## Insert chromosome for in case it's missing
insertchr <- function(gr) {
mask <- which(!grepl('chr', seqnames(gr)))
mcols(gr)$chromosome <- as.character(seqnames(gr))
mcols(gr)$chromosome[mask] <- sub(pattern='^', replacement='chr', mcols(gr)$chromosome[mask])
mcols(gr)$chromosome <- as.factor(mcols(gr)$chromosome)
return(gr)
}
## Write phased fragments to file
if (!is.null(fragments)) {
#get coverge values per genomic site
rle.fragments.cov <- coverage(fragments)
fragments.cov <- unlist( S4Vectors::runValue(rle.fragments.cov), use.names = FALSE ) #get values of coverage per genomic site
fragments.cov.ranges <- unlist( ranges(rle.fragments.cov), use.names = FALSE ) #get genomic ranges with certain value of coverage
fragments <- GenomicRanges::GRanges(seqnames = seqlevels(fragments), ranges = fragments.cov.ranges, cov = fragments.cov)
fragments <- insertchr(fragments)
savefile.fragments <- file.path(outputfolder, paste0(index, '_phased.bed.gz'))
savefile.fragments.gz <- gzfile(savefile.fragments, 'w')
header <- paste('track type=bedGraph name=', index,'_reads description=BedGraph_of_phasedReads_',index, ' visibility=full color=',col, sep="")
write.table(header, file=savefile.fragments.gz, row.names=FALSE, col.names=F, quote=FALSE, append=F, sep='\t')
if (length(fragments)>0) {
#bedG <- as.data.frame(fragments)[c('chromosome','start','end','cov')]
bedG <- as(fragments, "data.frame")[c('chromosome','start','end','cov')]
} else {
bedG <- data.frame(chromosome='chr1', start=1, end=1, cov=NA)
}
write.table(bedG, file=savefile.fragments.gz, row.names=FALSE, col.names=F, quote=FALSE, append=T, sep='\t')
close(savefile.fragments.gz)
}
}
#' Generates a VCF file from phased haplotypes
#'
#' @param index Unique identifier used to index analyzed individual/sample
#' @param outputfolder Location to store created VCF file(s)
#' @param phasedHap Data object containing phased haplotypes
#' @param positions Set of heterozygous SNV positions used for phasing obtained from an input VCF files.
#' @param bsGenome A \code{BSgenome} object which contains reference genome used to infer reference alleles.
#' @param ref.fasta A user defined reference FASTA file to extract reference allele for all SNV positions.
#' @param chromosome Name of the chromosome for which we want to export vcf file
#' @import BSgenome
#' @importFrom GenomeInfoDb seqlengths
#' @importFrom Rsamtools FaFile indexFa scanFaIndex
#' @inheritParams phaseHETinversion
#' @author David Porubsky
#' @export
exportVCF <- function(index=NULL, outputfolder=NULL, phasedHap=NULL, positions=NULL, bsGenome=NULL, ref.fasta=NULL, chromosome=NULL, assume.biallelic=FALSE) {
## Print VCF header
#savefile.vcf.gz <- gzfile(savefile.vcf, 'w')
savefile.vcf <- file.path(outputfolder, paste0(chromosome, '_phased.vcf'))
if (!is.null(bsGenome)) {
## Get chromosome length from bsgenome object
chr.len <- GenomeInfoDb::seqlengths(bsGenome)[seqnames(bsGenome) == chromosome]
reference <- paste0("##reference=", attributes(bsGenome)$pkgname)
} else if (!is.null(ref.fasta)) {
## Check if submitted fasta file is indexed
ref.fasta.idx <- paste0(ref.fasta, ".fai")
if (!file.exists(ref.fasta.idx)) {
message("Index file for '", ref.fasta, "' not available, indexing ...")
fa.idx <- Rsamtools::indexFa(file = ref.fasta)
}
## Get FASTA index
fa.idx <- Rsamtools::scanFaIndex(ref.fasta)
## Get chromosome length from reference fasta
chr.len <- GenomeInfoDb::seqlengths(fa.idx)[chromosome]
reference <- paste0("##reference=", ref.fasta)
} else {
## If both bsGenome as well as ref.fasta are missing ASSUME chromosome length is the max SNV position for a given chromosome.
chr.len <- max(end(positions))
reference <- paste0("##reference=", 'unknown')
}
fileformat <- "##fileformat=VCFv4.2"
date <- paste("##fileDate=",Sys.Date(),sep="")
source.alg <- "##source=StrandPhase_algorithm"
reference <- reference
phasing <- "##phasing=Strand-seq"
format1 <- "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
format2 <- "##FORMAT=<ID=Q1,Number=1,Type=Float,Description=\"Quality measure of allele 1 (1-entropy)*coverage\">"
format3 <- "##FORMAT=<ID=Q2,Number=1,Type=Float,Description=\"Quality measure of allele 2 (1-entropy)*coverage\">"
format4 <- "##FORMAT=<ID=P1,Number=1,Type=Float,Description=\"Probability value of allele 1\">"
format5 <- "##FORMAT=<ID=P2,Number=1,Type=Float,Description=\"Probability value of allele 2\">"
contig <- paste0("##contig=<ID=",chromosome, ",length=",chr.len,">")
cat(fileformat,date,source.alg,reference,phasing,format1,format2,format3,format4,format5,contig, sep = "\n", file=savefile.vcf, append=FALSE)
#cat("##contig=<ID=",chromosome, ",length=",chr.len,">", sep="", file=savefile.vcf, append=F)
cat("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t", index, "\n", sep = "", file=savefile.vcf, append=TRUE)
if (!is.null(phasedHap)) {
hap1.pos <- phasedHap$hap1.cons$pos
hap2.pos <- phasedHap$hap2.cons$pos
snv.pos <- sort(union(hap1.pos, hap2.pos))
names(hap1.pos) <- as(phasedHap$hap1.cons$bases, "vector")
names(hap2.pos) <- as(phasedHap$hap2.cons$bases, "vector")
hap1.gap <- setdiff(snv.pos, hap1.pos)
hap2.gap <- setdiff(snv.pos, hap2.pos)
names(hap1.gap) <- rep("", length(hap1.gap))
names(hap2.gap) <- rep("", length(hap2.gap))
hap1.alleles <- names(sort(c(hap1.pos, hap1.gap)))
hap2.alleles <- names(sort(c(hap2.pos, hap2.gap)))
#if (!grepl('chr', chromosome)) { chromosome <- sub(pattern='^', replacement='chr', chromosome) } #always add 'chr' if missing at the beginning of chromosome number
## Get phased SNV positions
snv.ranges <- GenomicRanges::GRanges(seqnames=chromosome, IRanges(start=snv.pos, end=snv.pos))
if (!is.null(bsGenome)) {
## Extract reference alleles from the reference bsgenome object if available
ref.alleles <- Biostrings::Views(bsGenome, snv.ranges)
ref.alleles <- as(ref.alleles, "DNAStringSet")
ref.alleles <- as(ref.alleles, "vector")
} else if (!is.null(ref.fasta)) {
## Extract SNV bases from user defined reference FASTA file
fa.file <- open(Rsamtools::FaFile(ref.fasta))
snv.seq <- Rsamtools::scanFa(file = fa.file, param = snv.ranges, as = "DNAStringSet")
names(snv.seq) <- NULL
ref.alleles <- as.character(snv.seq)
} else if (!is.null(positions) & 'ref' %in% colnames(mcols(positions))) {
ref.alleles <- positions[start(positions) %in% snv.pos]$ref
} else {
ref.alleles <- 'N'
}
names(hap1.pos) <- (1-phasedHap$hap1.cons$ent) * phasedHap$hap1.cons$cov
names(hap2.pos) <- (1-phasedHap$hap2.cons$ent) * phasedHap$hap2.cons$cov
names(hap1.gap) <- rep(".", length(hap1.gap))
names(hap2.gap) <- rep(".", length(hap2.gap))
hap1.qual <- names(sort(c(hap1.pos, hap1.gap)))
hap2.qual <- names(sort(c(hap2.pos, hap2.gap)))
names(hap1.pos) <- phasedHap$hap1.cons$score
names(hap2.pos) <- phasedHap$hap2.cons$score
names(hap1.gap) <- rep(".", length(hap1.gap))
names(hap2.gap) <- rep(".", length(hap2.gap))
hap1.score <- names(sort(c(hap1.pos, hap1.gap)))
hap2.score <- names(sort(c(hap2.pos, hap2.gap)))
alt.alleles1 <- rep("", length(snv.pos))
alt.alleles2 <- rep("", length(snv.pos))
alt.alleles <- rep("", length(snv.pos))
chr <- rep(chromosome, length(snv.pos))
id <- rep(".", length(snv.pos))
qual <- rep(".", length(snv.pos))
filter <- rep(".", length(snv.pos))
info <- rep(".", length(snv.pos))
format <- rep("GT:Q1:Q2:P1:P2", length(snv.pos))
alt.alleles1[ref.alleles != hap1.alleles & hap1.alleles != ""] <- hap1.alleles[ref.alleles != hap1.alleles & hap1.alleles != ""]
alt.alleles2[ref.alleles != hap2.alleles & hap2.alleles != ""] <- hap2.alleles[ref.alleles != hap2.alleles & hap2.alleles != ""]
hap1.alleles[alt.alleles2 == hap1.alleles & hap1.alleles != "" & alt.alleles1 != "" & alt.alleles2 != "" & alt.alleles1 != alt.alleles2] <- 2
hap2.alleles[alt.alleles2 == hap2.alleles & hap2.alleles != "" & alt.alleles1 != "" & alt.alleles2 != "" & alt.alleles1 != alt.alleles2] <- 2
hap1.alleles[ref.alleles == hap1.alleles] <- 0
hap2.alleles[ref.alleles == hap2.alleles] <- 0
hap1.alleles[alt.alleles1 == hap1.alleles & hap1.alleles != "" & alt.alleles1 != ""] <- 1
hap1.alleles[alt.alleles2 == hap1.alleles & hap1.alleles != "" & alt.alleles2 != ""] <- 1
hap2.alleles[alt.alleles1 == hap2.alleles & hap2.alleles != "" & alt.alleles1 != ""] <- 1
hap2.alleles[alt.alleles2 == hap2.alleles & hap2.alleles != "" & alt.alleles2 != ""] <- 1
hap1.alleles[hap1.alleles == ""] <- "."
hap2.alleles[hap2.alleles == ""] <- "."
## Fill gaps in haplotypes by opposing alleles assuming that all position are heterozygous
if (assume.biallelic) {
hap1.alleles[hap1.alleles == '.'] <- dplyr::recode(hap2.alleles[hap1.alleles == '.'], '0' = '1', '1' = '0')
hap2.alleles[hap2.alleles == '.'] <- dplyr::recode(hap1.alleles[hap2.alleles == '.'], '0' = '1', '1' = '0')
hap1.qual[hap1.qual == '.'] <- '0'
hap2.qual[hap2.qual == '.'] <- '0'
hap1.score[hap1.score == '.'] <- '0'
hap2.score[hap2.score == '.'] <- '0'
}
phased.alleles <- paste(hap1.alleles, hap2.alleles, sep="|")
genotypes <- paste(phased.alleles, hap1.qual, hap2.qual, hap1.score, hap2.score, sep=":")
collapse.alt <- function(X,Y) {
if (X==Y & X!="" & Y!="") {
return(X)
} else if (X!=Y & Y=="" & X!="") {
return(X)
} else if (X!=Y & Y!="" & X=="") {
return(Y)
} else if (X!=Y & Y!="" & X!="") {
return(paste(X,Y, sep=","))
} else {
return("")
}
}
alt.alleles <- mapply(collapse.alt, alt.alleles1, alt.alleles2, USE.NAMES = FALSE)
alt.alleles[alt.alleles == ""] <- "N"
snv.pos <- as.integer(format(snv.pos, scientific = FALSE)) #make sure no float numbers in the output
df <- data.frame(chr, snv.pos, id, ref.alleles, alt.alleles, qual, filter, info, format, genotypes, stringsAsFactors = FALSE)
utils::write.table(df, file=savefile.vcf, row.names=FALSE, col.names=FALSE, quote=FALSE, append=TRUE, sep='\t')
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.