##########################################
# Goal: Generate the GRangesList object for the hg19 varbins at 100k and
# 200k resolution.
#
# Inputs:
# https://github.com/navinlabcode/CNV_pipeline/tree/master/lib/
# - varbin.gc.content.100k.bowtie.k50.hg19.bin_not_removed.txt
# - varbin.gc.content.200k.bowtie.k50.hg19.bin_not_removed.txt
#
# Output: 'sysdata.rda' which additionally contains a GRangesList object
# named 'varbin_hg19_grangeslist'.
#
# 1. Convert the raw coordinate from a mixture of 0-based and 1-based to 1-based.
# For example, start=0, end=977835, bin_length=977386.
# Equivalently, pure 0-based: [0, 977836); pure 1-based: [1, 977836]
# 2. Provide the correct genomic locations and attache meta information (e.g.,
# GC content) for all varbins.
# 3. Chr-style is chr1, chr2, ..., chrX, chrY. (No mitochondrion chromosome)
#
##########################################
# Author: Yun Yan (yun.yan@uth.tmc.edu)
##########################################
library(devtools)
library(usethis)
library(GenomicRanges)
library(GenomeInfoDb)
# VarBins annotations are from the official pipeline's lib: UCSC style (chr1, chrX, chrY)
fpath_varbin <- list(
`res_100k` = "/volumes/seq/code/PIPELINES/CNA_pipeline_v1.4/lib/varbin.gc.content.100k.bowtie.k50.hg19.bin_not_removed.txt",
`res_200k` = "/volumes/seq/code/PIPELINES/CNA_pipeline_v1.4/lib/varbin.gc.content.200k.bowtie.k50.hg19.bin_not_removed.txt"
)
make_granges_from_varbin_file <- function(x) {
df <- read.delim(x, header = T, stringsAsFactors = F)
colnames(df) <- gsub(pattern = "\\.", replacement = "_", x = colnames(df))
# Input:
# bin.chrom bin.start bin.end bin.length
# chr1 0 977835 977836
# chr1 977836 1200862 223027
# chr1 1200863 1455237 254375
# chr1 1455238 1758056 302819
# ...
# chr2 0 237130 237131
# chr2 237131 454244 217114
# chr2 454245 660285 206041
## Convert to pure 0-based coordination
df$bin_end <- df$bin_end + 1
## Convert to a legal GRange object (automatically switched to 1-based)
gr <- makeGRangesFromDataFrame(
df = df,
keep.extra.columns = T,
seqnames.field = "bin_chrom",
start.field = "bin_start",
end.field = "bin_end",
starts.in.df.are.0based = TRUE
)
return(gr)
# Output:
# seqnames ranges strand | bin_length
# [1] chr1 1-977836 * | 977836
# [2] chr1 977837-1200863 * | 223027
# [3] chr1 1200864-1455238 * | 254375
# ...
# [1] chr2 1-237131 * | 237131
# [2] chr2 237132-454245 * | 217114
# [3] chr2 454246-660286 * | 206041
}
list_gr <- lapply(fpath_varbin, make_granges_from_varbin_file)
varbin_hg19_grangeslist <- GRangesList(unlist(list_gr))
cat(length(varbin_hg19_grangeslist), "resolutions are available:\n")
cat(names(varbin_hg19_grangeslist), "\n")
cat("How many varbins are available per resolution?:\n")
print(sapply(varbin_hg19_grangeslist, length))
hg19_seqinfo <- GenomeInfoDb::Seqinfo(genome = "hg19")
GenomeInfoDb::seqinfo(varbin_hg19_grangeslist) <- hg19_seqinfo
# Export to package internal data
# devtools::load_all('./')
# usethis::use_data(
# major_palette, minor_palette, hg19_genes, ## previously available
# varbin_hg19_grangeslist, ## Added in this script
# internal = TRUE, overwrite = TRUE)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.