# This is an alternative to using seqminer for annotations
# Instead of relying on a large annotation file, and seqminer,
# we fetch gene coordinates from bioMart and find whether the given list
# of positions are covered by any of them
#' biomaRt genes
#'
#' Extract gene informations from biomaRt
#'
#' @param release A \code{character}. Genome release name. Valid releases are
#' "hg19", "hg38".
#' @param chr A \code{character} or \code{NULL}. If specified, only the genes
#' from the specified chromosomes will be returned.
#' @param mirror A \code{character}. Passed to \code{useEnsembl}.
#'
#' @return A \code{data.table} with the gene symbol, start and end position
#' and matching ensembl transcripts.
#'
#' @examples
#' #Chromosome X, hg19
#' egX <- mart_genes("hg19", mirror = "useast")
#' #Full genome, latest release
#' eg <- mart_genes("hg38", mirror = "useast")
#'
#' @importFrom biomaRt useMart getBM useEnsembl
#' @export
mart_genes <- function(release = "hg38", chr = "X", mirror = NULL){
release <- tolower(release)
if(release %in% c("grch38", "hg38", "latest")){
mart <- useEnsembl(biomart = "ensembl", mirror = mirror, dataset = "hsapiens_gene_ensembl")
#mart <- useMart(biomart="ensembl", dataset="hsapiens_gene_ensembl")
message("Fetching genes for hg38")
} else if(release %in% c("grch37", "hg19")){
mart <- useEnsembl(biomart = "ensembl", mirror = mirror, GRCh = "37", dataset = "hsapiens_gene_ensembl")
#mart <- useMart(biomart="ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org", path="/biomart/martservice", dataset="hsapiens_gene_ensembl")
message("Fetching genes for hg19")
} else if(release %in% c("hg17", "hg18")){
stop(paste(release, "is too old (hg18 = 2006)"))
} else{
stop("Incorrect release number")
}
atts <- c("hgnc_symbol", "chromosome_name", "start_position", "end_position", "gene_biotype")
if(!is.null(chr)){
p2g <- data.table(getBM(attributes = atts, mart = mart, filters = "chromosome_name", values = "X"))
} else{
p2g <- data.table(getBM(attributes = atts, mart = mart))
}
setnames(p2g, c("hgnc_symbol", "chromosome_name", "start_position", "end_position"), c("GENE", "CHROM", "start", "end"))
return(p2g)
}
#' Annotate
#'
#' Map positions of SNPs to genes extracted from biomaRt
#'
#' @param xciObj A \code{data.table}. The data to be annotated must contain
#' at least the 4 columns 'GENE', 'POS', 'AD_hap1', 'AD_hap2'. Additional
#' columns will be preserved.
#' @param read_count_cutoff A \code{numeric}. Keep only SNPs that have at least
#' that many reads.
#' @param het_cutoff A \code{numeric}. Keep only SNPs that have at least that
#' many reads on each allele.
#' @param release A \code{character}. Genome release name. Valid releases are
#' "hg19", "hg38".
#' @param verbose A \code{logical}. If set to TRUE, print additional information.
#'
#' @return A \code{data.table}. The input table annotated with gene symbols
#' and filtered for read counts.
#'
#' @example inst/examples/workflow.R
#'
#' @export
annotateX <- function(xciObj, read_count_cutoff = 20, het_cutoff = 3,
release = "hg38", mirror = NULL, verbose = FALSE){
# 1) Extract the genes
egX <- mart_genes(release = release, mirror = mirror)[GENE != ""]
egX <- egX[gene_biotype %in% c("protein_coding", "pseudogene", "lncRNA")] #Remove miRNA, rRNA,
# Handle genes that have multiple start/end positions
dupg <- egX[duplicated(egX$GENE), GENE]
if(verbose){
message(paste("There are", length(dupg), "genes with multiple entries."))
}
egX[, start:= min(start), by = "GENE"]
egX[, end:= max(end), by = "GENE"]
egX <- unique(egX)
#2) Map positions to the genes
# Filtering first to make everything faster
xciAnno <- xciObj[AD_hap1 + AD_hap2 >= read_count_cutoff & AD_hap1 >= het_cutoff & AD_hap2 >= het_cutoff]
if(!"CHROM" %in% names(xciAnno)){
warning("'CHROM' not found in the object, assuming all entries are X-linked.")
xciAnno[, CHROM := "X"]
}
xciAnno[, ID := paste(CHROM, POS, sep = ":")]
p2gX <- .pos2gene(xciAnno[, list(ID, POS)], egX)
p2gX[, GENE_pos := (start+end)/2] #This can be useful for plots to order genes
xciAnno <- merge(p2gX[, list(ID, GENE, GENE_pos)], xciAnno, by = "ID") #The merge naturally removes intergenic SNPs
return(xciAnno)
}
.pos2gene <- function(target, interval){
pos <- unique(copy(target)) #Still need to copy b/c if target is already unique we'll overwrite
gene <- copy(interval)
# TODO:
# a) Handle chromosome
# b) Fetch genes from biomaRt
# INPUTS
# Assume first column is an ID and second is the positions
setnames(pos, c("ID", "POS"))
pos[, start := POS]
pos[, end := POS]
setkey(pos, start, end)
# Assume genes have columns GENE, start, end
olps <- foverlaps(gene, pos, type = "any")
ret <- olps[!is.na(ID), list(ID, POS, GENE, i.start, i.end)]
setnames(ret, c("i.start", "i.end"), c("start", "end"))
return(ret)
}
################################################################################
# Annotation using seqminer
################################################################################
#' @importFrom seqminer annotatePlain makeAnnotationParameter
.seqm_anno <- function(input, output_dir, reference = NULL, geneFile = NULL){
if(is.null(reference)){
reference <- system.file("extdata", "human.g1k.v37.fa", package = "XCIR")
if(!file.exists(reference))
stop("No reference file given or found.")
}
if(is.null(geneFile)){
geneFile <- system.file("extdata", "refFlat.gencode.v19.gz", package = "XCIR")
if(!file.exists(geneFile))
stop("No gene file given or found.")
}
inFile <- tempfile(tmpdir = output_dir)
.write_siteRef(input, ref_file = inFile) #Write annotations into the file that will be used by seqminer
param <- (list(reference = reference, geneFile = geneFile, inputFormat = "plain"))
output_file <- file.path(output_dir, "anno_out")
capture.output({
annotatePlain(inFile, output_file, params=param) #seqminer does not allow using an existing filename
})
return(output_file)
}
#' Read annotation file
#'
#' Read a given annotation file and merge it with a data.table containing the
#' relevant information to estimate inactivated X chromosome expression and
#' filter out SNPs with low coverage.
#'
#' @param dt A \code{data.table} object.
#' @param seqm_annotate A \code{logical}. If set to TRUE, the \code{seqminer}
#' package will be used to annotate \code{dt}. If set to FALSE, this function
#' is a simple read count filtering step.
#' @param read_count_cutoff A \code{numeric}. Keep only SNPs that have at least
#' that many reads.
#' @param het_cutoff A \code{numeric}. Keep only SNPs that have at least that
#' many reads on each allele.
#' @param filter_pool_cutoff A \code{numeric}. Keep only SNPs that have at
#' least that many reads on each allele across all samples. See details for
#' more information.
#' @param anno_file A \code{character}. The name of a file containing annotations.
#'
#' @details
#' If the samples all have the same genotype (e.g: technical replicates),
#' \code{filter_pool_cutoff} will sum counts across samples and preserve SNPs
#' that pass the cutoff on both the reference and alternate alleles. This
#' may lead to samples with 0 counts on either allele but will prevent removing
#' heterozygous sites with lower coverage (especialliy in skewed samples).
#' \code{seqm_anno} will call \code{annotatePlain} from the \code{seqminer}
#' package. For convenience, \code{seqminer}'s necessary annotation sources can
#' be copied into \code{XCIR}'s extdata folder. See ?annotatePlain for more
#' information.
#'
#' @return A \code{data.table} object that contains allelic coverage, genotype
#' and annotations at the covered SNPs.
#'
#' @example inst/examples/workflow.R
#'
#' @seealso \code{annotatePlain}
#'
#' @export
addAnno <- function(dt, seqm_annotate = TRUE, read_count_cutoff = 20,
het_cutoff = 3, filter_pool_cutoff = 3, anno_file = NULL){
dt <- dt[AD_hap1 + AD_hap2 > read_count_cutoff]
if(seqm_annotate){
output_dir <- tempdir()
anno_file <- .seqm_anno(input = copy(dt), output_dir)
} else if(is.null(anno_file)){
stop("No annotations given, set seqm_annotate to TRUE or provide an annotation filename to anno_file")
}
anno <- fread(anno_file)
setnames(anno, gsub("#", "", names(anno)))
anno[, GENE := gsub("^.*:", "", ANNO)]
anno[, ANNO := gsub(":.*$", "", ANNO)]
anno[, CHROM := as.character(CHROM)]
anno[, POS := as.numeric(POS)]
anno[, ANNO_FULL := NULL]
anno <- anno[GENE != "Intergenic"]
DPR_anno <- merge(unique(anno), dt, by = c("CHROM", "POS", "REF", "ALT"))
# Finding heterozygous SNPs
if(filter_pool_cutoff > 0){
poskp <- DPR_anno[, list(sum(AD_hap1), sum(AD_hap2)), by = c("CHROM", "POS")][V1 >= filter_pool_cutoff & V2 >= filter_pool_cutoff][, POS]
DPR_anno <- DPR_anno[POS %in% poskp]
}
DPR_anno <- DPR_anno[AD_hap1 >= het_cutoff & AD_hap2 >= het_cutoff]
return(DPR_anno)
}
#' @export
.write_siteRef <- function(dt, ref_file = "test_ref.anno.tsv"){
write.table(unique(dt[, list(CHROM, POS, REF, ALT)]), file = ref_file,
row.names = FALSE, quote = FALSE, sep = "\t")
}
#' XCI consensus
#'
#' Read consensus & XCIR calls for all X-linked genes
#'
#' @param redownload A \code{logical}. If set to TRUE, the original supplementary
#' file is redownloaded from PMC.
#' @param simple A \code{logical}. If set to TRUE, minimal information is returned,
#' only for genes with an available XCIR classification.
#'
#' @details
#' The consensus is as published in Supplementary table S1 of
#' Balaton et al. (Biol Sex Differ. 2015). doi: 10.1186/s13293-015-0053-7
#'
#' @return A \code{data.table} with the annotated X-linked genes.
#'
#' @examples
#' consensusXCI(simple = TRUE)
#'
#' @importFrom readxl read_xlsx
#' @importFrom utils download.file
#' @export
consensusXCI <- function(redownload = FALSE, simple = TRUE){
if(redownload){
tmp = tempfile()
download.file("https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4696107/bin/13293_2015_53_MOESM1_ESM.xlsx", destfile = tmp, mode = "wb")
cons <- data.table(read_xlsx(tmp))
} else{
cons <- data.table(read_xlsx(system.file("extdata/13293_2015_53_MOESM1_ESM.xlsx", package = "XCIR")))
}
setnames(cons, tolower(chartr(" ", "_", names(cons))))
# geu <- fread(system.file("extdata/GEU_xcir_181122_flag.tsv", package = "XCIR"))
# geuall <- getXCIstate(geu)
# geuskew <- getXCIstate(geu[f < .25])
# geus <- merge(geuall, geuskew, by = "GENE", suffixes = c("_all", "_skew"), all.x = TRUE)
geus <- fread(system.file("extdata/GEU_xcistate_181122.tsv", package = "XCIR"))
cons <- merge(geus, cons, by.x = "GENE", by.y = "gene_name", all.y = TRUE)
cons <- cons[, c(names(geus), "balaton_consensus_calls", "y_homology"), with = FALSE]
if(simple)
cons <- cons[!is.na(XCIstate_skew)]
return(cons)
}
#' Annotate
#'
#' Find SNPs based on matching position in dbgap using biomaRt
#'
#' @param object A \code{data.table} with genomic positions. Typically, the
#' table returned by \code{betaBinomXI}.
#' @param pos_col A \code{character} indicating the column that contains genomic
#' positions.
#' @param chr A \code{character}. Filter SNPs for the selected chromosome.
#' @export
annotates_snps <- function(object, pos_col = "POS", chr = "X"){
# read from file instead of biomaRt
positions <- object[, get(pos_col)]
mart <- useMart(biomart="ENSEMBL_MART_SNP", dataset="hsapiens_snp")
atts <- c("refsnp_id", "chrom_start")
snps <- getBM(atts, filters = "chr_name", values = chr, mart = mart)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.