# function to determine hmC level from mlml output file
process.hmc <- function(file, output_folder, Coverage){
if(!is.character(file))
stop('file has to be of class character ..')
if(!is.character(output_folder))
stop('output_folder has to be of class character ..')
if(!is(Coverage,"GRanges"))
stop('Coverage has to be of class GRanges with column name coverage..')
output_folder <- normalizePath(output_folder)
sample_name <- unlist(strsplit(file, split=".txt"))
output.files <- list()
temp_data <- fread(file, sep="\t", header=FALSE)
temp_data_gr <- GRanges(temp_data[,1],IRanges(temp_data[,2], temp_data[,2]))
ov <- findOverlaps(temp_data_gr, Coverage)
mcols(temp_data_gr)$coverage <- 0
ind1 <- queryHits(ov)
ind2 <- subjectHits(ov)
mcols(temp_data_gr)$coverage[ind1] <- mcols(Coverage)$coverage[ind2]
temp_data[,4] <- round((temp_data[,4]*mcols(temp_data_gr)$coverage))
temp_data[,5] <- round((temp_data[,5]*mcols(temp_data_gr)$coverage))
temp_data[,6] <- round((temp_data[,6]*mcols(temp_data_gr)$coverage))
ind <- which(temp_data[,7]==0)
temp_data <- temp_data[ind,]
temp_data_CpG <- temp_data[,c(1,2,4,6)]
temp_data_hmC <- temp_data[,c(1,2,5,6)]
filename <- file.path(output_folder,
paste(sample_name, "_", "CpG", ".txt", sep=""))
write.table(temp_data_CpG, file=filename)
filename <- file.path(output_folder,
paste(sample_name, "_", "hmC", ".txt", sep=""))
write.table(temp_data_hmC, file=filename)
}
# function to determine the methylation
meth.call <- function(files_location, output_folder, no_overlap, read.context, Nproc){
if(!is.character(files_location))
stop('files_location has to be of class character ..')
if(!is.character(output_folder))
stop('output_folder has to be of class character ..')
if(!is.logical(no_overlap))
stop('no_overlap has to be of class logical ..')
if(!is.character(read.context))
stop('read.context has to be of class character ..')
if(!is.numeric(Nproc))
stop('Nproc has to be of class numeric ..')
files_location <- normalizePath(files_location)
output_folder <- normalizePath(output_folder)
# read.type
if( !( read.context %in% c("CpG","All")))
stop("wrong 'read.context' argument supplied, only 'CpG' or 'All' accepted")
# list of input sam files
all_files <- list.files(path = files_location, pattern = ".sam")
sample_name <- unlist(strsplit(all_files, split=".sam"))
# list of output files
output.files <- list()
output.files[[read.context]] <- file.path(output_folder,
paste(sample_name, "_", read.context, ".txt", sep=""))
# creation of system command
read.files <- file.path(files_location, all_files)
perl.arguments <- paste("--file", read.files, "--sam_type paired_sam")
if(read.context == "CpG" )
{perl.arguments <- paste(perl.arguments, "--CpG", output.files[["CpG"]] )}
if(read.context == "All" )
{perl.arguments <- paste(perl.arguments, "--All", output.files[["All"]] )}
if(no_overlap){perl.arguments <- paste(perl.arguments, "--no_overlap" )}
# location of the perl script
perl.loc <- (system.file("exec", "methinfo.pl", package="methylPipe"))
cmd <- paste("perl", perl.loc, perl.arguments)
# then call perl to process the file
message( )
message("Extracting methylation information from input SAM files and creating output files for each sample...")
message()
runperl <- function(x)
{
status <- try(system(x))
if(status != 0)
{stop("\nError in methylation calling...\n
Check if it is a sorted Bismark SAM file\n")}
}
parallel::mclapply(cmd, runperl, mc.cores=Nproc, mc.preschedule=TRUE)
message("Creating temporary BAM files from input SAM files...")
message()
setwd(output_folder)
system('mkdir tempBAM')
temp_folder <- paste0(output_folder,"/tempBAM")
for (i in 1:length(all_files))
asBam(file.path(files_location, all_files[i]), destination=file.path(temp_folder, sample_name[i]), overwrite=TRUE)
message("Creating uncovered regions objects for each sample from BAM files...")
message()
bam.files <- file.path(path = temp_folder, paste(sample_name, ".bam", sep=""))
for (i in 1:length(all_files))
{
aln_file <- bam.files[i]
aln <- readGAlignments(aln_file)
cov <- coverage(aln)
#ind <- sapply(cov, function(x)(length(which(runValue(x)== 0))))
ind <- sapply(cov, function(x)(length(runValue(x))))
cov <- cov[which(ind > 1)]
uncov_GR <- as(cov, "GRanges")
uncov_GR <- uncov_GR[mcols(uncov_GR)$score== 0]
filename <- file.path(output_folder, paste0(sample_name[[i]],"_uncov", ".Rdata"))
Objectout <- paste0(sample_name[[i]],"_uncov")
assign(Objectout, uncov_GR)
save(list=Objectout, file=filename)
}
message('Removing all temporary BAM files...')
message()
str <- paste("rm -r","tempBAM")
system(str)
message('Methylation info and Uncovered regions output files created for each sample in output_folder...')
message()
message("Processing done Successfully...")
message()
}
#### combining reads of replicates within a single group
pool.reads <- function(files_location)
{
if(!is.character(files_location))
stop('files_location has to be of class character ..')
files_location <- normalizePath(files_location)
all_files <- list.files(files_location,pattern=".txt")
setwd(files_location)
cmd <- paste("cat", paste(all_files, collapse= " "), ">", paste0(files_location, "/", "sample_merge.txt"))
system(cmd)
python.loc <- system.file("exec", "pool_reads.py", package="methylPipe")
merge_file_loc <- paste0(files_location, "/", "sample_merge.txt")
cmd <- paste("python", python.loc, merge_file_loc, ">",paste0(files_location, "/","sample_merge_pooled.txt"))
system(cmd)
cmd <- paste("sort -k1,1 -k2,4n", paste0(files_location, "/", "sample_merge_pooled.txt") , ">",
paste0(files_location, "/", "sample_merge_pooled_sort.txt"))
system(cmd)
cmd <- paste("rm","*_merge.txt")
system(cmd)
cmd <- paste("rm","*_merge_pooled.txt")
system(cmd)
}
tabixdata2GR <- function(x) {
if(length(x) == 0) return(NA)
temp <- strsplit(x, split="\t")
mat <- matrix(unlist(temp), ncol=7, byrow=TRUE)
df <- as.data.frame(mat, stringsAsFactors=FALSE)
df_gr <- GRanges(df[,1], IRanges(as.numeric(df[,2]), as.numeric(df[,2])),
strand=df[,3], Context=df[,4], C=as.numeric(df[,5]), T=as.numeric(df[,6]), Significance=as.numeric(df[,7]))
rm(df)
rm(mat)
return(df_gr)
}
BSdata <- function(file, uncov, org) {
bsdata <- new('BSdata', file=file, uncov=uncov, org=org)
return(bsdata)
}
BSdataSet <- function(org, group, ...) {
Objname <- names(list(...))
if(!is.null(Objname))
new("BSdataSet", org=org, group=group, Objlist=list(...), names=names(list(...)))
else stop('define the names for the Objects in the arguments..')
}
GElist <- function(...)
{
Objname <- names(list(...))
if(!is.null(Objname))
new("GElist", Objlist=list(...), names=names(list(...)))
else stop('define the names for the Objects in the arguments..')
}
BSprepare <- function(files_location, output_folder, tabixPath, bc=1.5/100) {
if(!is.character(files_location))
stop('files_location has to be of class character ..')
if(!is.character(output_folder))
stop('output_folder has to be of class character ..')
if(!is.character(tabixPath))
stop('tabixPath has to be of class character ..')
if(!file.exists(paste(tabixPath, '/tabix', sep='')))
stop('tabix not found at tabixPath ..')
if(!file.exists(paste(tabixPath, '/bgzip', sep='')))
stop('bgzip not found at tabixPath ..')
files_location <- normalizePath(files_location)
output_folder <- normalizePath(output_folder)
tabixPath <- normalizePath(tabixPath)
binomTestMulti <- function(mat, maxN=50, p=bc, bh=TRUE) {
bmat <- matrix(NA, maxN, maxN)
for(i in 1:maxN) {
for(j in i:maxN) bmat[i,j] <- binom.test(x=i, n=j, p=p,
alternative='greater')$p.value
}
pVfun <- function(x) {
if(x[2] < (maxN+1)) return(bmat[x[1], x[2]])
else return(binom.test(x=x[1], n=x[2], p=p,
alternative='greater')$p.value)
}
pValues <- apply(mat, 1, pVfun)
if(bh) pValues <- p.adjust(pValues, method='BH')
pValues <- -round(10*log10(pValues))
return(pValues)
}
####### checking for the chromosome column ##########
all_files <- list.files(path = files_location, pattern = ".txt")
for (i in 1:length(all_files))
{
path <- paste0(files_location,"/",all_files[[i]])
temp_data <- fread(path,nrows=10)
if(length(grep("chr",temp_data$V1))==0)
{
cmd <- paste("sed -i 's/\"//g'",path)
system(cmd)
cmd <- paste("sed -i 's/^/chr/'",path)
system(cmd)
}
}
############### sorting chromosome files ############
sample_name <- unlist(strsplit(all_files, split=".txt"))
cmd <- paste("sort -k1,1 -k2,4n", paste0(files_location, "/", all_files) , ">",
paste0(files_location, "/", sample_name,"_sort.txt"))
sapply(cmd,system)
cmd <- paste("mv", paste0(files_location, "/", sample_name,"_sort.txt"),
paste0(files_location, "/", sample_name,".txt"))
sapply(cmd,system)
######################################################
for(i in 1:length(all_files)) {
filecg <- all_files[i]
message(filecg)
filemC <- paste0(files_location, "/", filecg, '.mC')
str <- paste('cut -f5,6', paste0(files_location, "/", filecg), '>', filemC)
system(str)
# computing binomial pvalues
mCdat <- fread(filemC)
mCdat <- as.matrix(mCdat)
mCdat[,2] <- mCdat[,1]+ mCdat[,2]
pV <- binomTestMulti(mCdat, p=bc)
# attaching pvalues
filepV <- paste0(filemC, '.pvalues')
fileTabix <- paste0(files_location, "/", sample_name[i], '_tabix.txt')
write(pV, file=filepV, ncolumns=1)
str <- paste('paste', paste0(files_location, "/", filecg), filepV, '>', fileTabix)
system(str)
system(paste('rm', filepV, filemC))
}
# concatenating, compressing and building tabix index
message('postprocessing ..')
Tabix_files <- list.files(path = files_location, pattern = "_tabix.txt")
# generating the TABIX compressed file
for(filetb in Tabix_files) {
filetb_name <- unlist(strsplit(filetb,split=".txt"))
str <- paste('cat', paste0(files_location, "/", filetb), '>',
paste0(output_folder, "/", filetb_name,"_out.txt"))
system(str)
str <- paste0(tabixPath, '/bgzip ', output_folder, "/", filetb_name,"_out.txt")
system(str)
# generating the TABIX index file
fileoutgz <- paste0(output_folder, "/", filetb_name,"_out.txt", '.gz')
str <- paste(tabixPath, '/tabix -s 1 -b 2 -e 2 -f ', fileoutgz, sep='')
system(str)
}
}
# splitting chromosomes in N Mb regions ..
splitChrs <- function(chrs, org) {
if(!is(org,"BSgenome") && !is(org,"list"))
stop('org has to be either a list or an object of class BSgenome ..')
chrs_org <- seqnames(org)
if(any(!chrs %in% chrs_org))
stop('This is not a valid chromosome .. ')
chrL <- seqlengths(org)[chrs]
splitDf <- data.frame(chrs, 1, chrL, row.names = NULL, stringsAsFactors=FALSE)
colnames(splitDf) <- c('chr','start','end')
return(splitDf)
}
# fisher's method to determine a Pvalue based on a set of Pvalues
chiCombP <- function(Pvalues) {
chistat <- -2*(sum(log(Pvalues)))
chiP <- 1-pchisq(chistat, df=2*length(Pvalues))
return(chiP)
}
# consolidating DMRs ..
consolidateDMRs <- function(DmrGR, pvThr=0.05, MethDiff_Thr=NULL, log2Er_Thr=NULL,
GAP=0, type=NULL, correct=FALSE){
if(!is(DmrGR, "GRanges"))
stop('DmrGR has to be of class GRanges ..')
if(!is.numeric(pvThr))
stop('pvThr has to be of class numeric ..')
if(!is.null(MethDiff_Thr) && !is.numeric(MethDiff_Thr))
stop('MethDiff_Thr has to be either NULL or of class numeric ..')
if(!is.null(log2Er_Thr) && !is.numeric(log2Er_Thr))
stop('log2Er_Thr has to be either NULL or of class numeric ..')
if(!is.numeric(GAP))
stop('GAP has to be of class numeric ..')
if(!is.null(MethDiff_Thr) && any(!type %in% c("hyper","hypo")))
stop('type has to be either NULL or one of hyper and hypo ..')
if(!is.logical(correct))
stop('correct has to be of class logical ..')
if(correct) mcols(DmrGR)$pValue= p.adjust(mcols(DmrGR)$pValue, method='BH')
sInds <- which(mcols(DmrGR)$pValue <= pvThr)
if(!is.null(MethDiff_Thr)) {
# selecting based on methylation diff
mInds <- which(abs(mcols(DmrGR)$MethDiff_Perc) >= MethDiff_Thr)
sInds <- IRanges::intersect(sInds, mInds)
}
if(!is.null(log2Er_Thr)) {
# selecting based on log2enrichment
eInds <- which(abs(mcols(DmrGR)$log2Enrichment) >= log2Er_Thr)
sInds <- IRanges::intersect(sInds, eInds)
}
if(length(sInds) == 0) return(NULL)
DmrGR <- DmrGR[sInds,]
if(!is.null(type))
{
if(type=="hypo")
DmrGR <- DmrGR[mcols(DmrGR)$MethDiff_Perc < 0]
else
DmrGR <- DmrGR[mcols(DmrGR)$MethDiff_Perc > 0]
}
# building a GRanges for selected DMRs
joinDMR <- function(query, Gap=0, fun=chiCombP) {
if(!is.numeric(Gap))
stop('Gap has to be of class numeric .. ')
if(class(fun) != 'function')
stop('fun has to be of class function .. ')
commonChrs <- unique(as.character(seqnames(query)))
resIR_gr <- GRanges()
for(chrom in commonChrs) {
queryF <- query[seqnames(query) == chrom]
Pv <- mcols(queryF)$pValue
MethDiff <- mcols(queryF)$MethDiff_Perc
Enrichment <- mcols(queryF)$log2Enrichment
# extracting IRanges from the dataframe
IR1 <- IRanges(start(queryF), end(queryF))
IR2 <- IR1
if(Gap > 0) {
# Gap are added on each side of the genomic regions
# they will be taken back after the intersection
start(IR1) <- start(IR1) - Gap
end(IR1) <- end(IR1) + Gap
start(IR2) <- start(IR2) - Gap
end(IR2) <- end(IR2) + Gap
}
iIR <- IRanges::intersect(IR1, IR2)
if(length(iIR) == 0) next
if(Gap > 0) {
start(iIR) <- start(iIR) + Gap
end(iIR) <- end(iIR) - Gap
}
combPv <- rep(NA, length(iIR))
combMethDiff <- rep(NA, length(iIR))
combEnrichment <- rep(NA, length(iIR))
# determining which IR elements were combined in iIR
result1 <- findOverlaps(iIR, IR1)
res1 <- cbind(queryHits(result1), subjectHits(result1))
result2 <- findOverlaps(iIR, IR2)
res2 <- cbind(queryHits(result1), subjectHits(result1))
for(uind in unique(res2[,1])) {
inds <- which(res2[,1] == uind)
IR2ind <- res2[inds,2]
Pvs <- Pv[IR2ind]
Meth <- MethDiff[IR2ind]
Enr <- Enrichment[IR2ind]
combPv[uind] <- do.call('fun', list(Pvs))
combMethDiff[uind] <- do.call(mean, list(Meth))
combEnrichment[uind] <- do.call(mean, list(Enr))
}
suppressWarnings(resIR_gr <- append(resIR_gr, GRanges(Rle(chrom), IRanges(start(iIR), end(iIR)),
pValue=round(combPv,3), MethDiff_Perc=round(combMethDiff,3),
log2Enrichment=round(combEnrichment,3))))
}
# appending the new iIR to those generated for the other chromosomes
zeroWinds <- which(width(resIR_gr) == 1)
if(length(zeroWinds) > 0) resIR_gr <- resIR_gr[-zeroWinds,]
resIR_gr
}
dmrGRanges <- joinDMR(DmrGR, Gap=GAP)
return(dmrGRanges)
}
# retrieve mC calls for genomic regions given a BSdata object for a sample
mapBSdata2GRanges <- function(GenoRanges, Sample, context='all', mC=1, depth=0,
pValue=1) {
if(!is(GenoRanges, "GRanges"))
stop('GenoRanges has to be of class GRanges ..')
if(!is(Sample, "BSdata"))
stop('Sample has to be of class BSdata ..')
if(length(which(!(context %in% c('all','CG','CHG','CHH')))) > 0)
stop('context has to be either all or a combination of CG, CHG, and CHH ..')
if(!is.numeric(mC) && mC < 0 )
stop('mC has to be of class numeric and positive..')
if(!is.numeric(depth))
stop('depth has to be of class numeric ..')
if(!is.numeric(pValue))
stop('pValue has to be of class numeric ..')
if(pValue > 1)
stop('pValue has to be lower than 1 ..')
# querying TABIX indexed files based on the tabix seqnames,
# assigning NA to GRanges regions in chrs not represented
tabixChrs <- seqnamesTabix(Sample@file)
representedChr <- which(as.character(seqnames(GenoRanges)) %in% tabixChrs)
if(length(representedChr) == 0)
{
res <- as.list(rep(NA, length(GenoRanges)))
return(res)
}
otherChr <- which(!(as.character(seqnames(GenoRanges)) %in% tabixChrs))
if(length(otherChr) > 0) res <- as.list(rep(NA, length(GenoRanges)))
regions <- GenoRanges[representedChr,]
resScan <- scanTabix(Sample@file, param=regions)
resScan <- lapply(resScan, tabixdata2GR)
if(length(otherChr) > 0) res[representedChr] <- resScan
else res <- resScan
# filtering results
filterFun <- function(gr, context, mC, depth, pValue) {
indsList <- list()
if(context[1] != 'all') indsList$cl <- which(mcols(gr)$Context %in% context)
if(mC > 0) indsList$mC <- which(mcols(gr)$C > mC)
if(depth > 0) indsList$d <- which((mcols(gr)$C+mcols(gr)$T) > depth)
pV <- -10*log10(pValue)
if(pValue < 1) indsList$p <- which(mcols(gr)$Significance > pV)
tableInds <- table(unlist(indsList))
inds <- names(tableInds)[tableInds == length(indsList)]
if(is.null(inds) || length(inds) == 0) return(NA)
return(gr[as.numeric(inds),])
}
if(context != 'all' || mC > 1 || depth > 0 || pValue < 1) {
NAinds <- as.numeric(which(is.na(res)))
if(length(NAinds) > 0) {
res[-NAinds] <- lapply(res[-NAinds], filterFun, context, mC, depth, pValue)
}
else res <- lapply(res, filterFun, context, mC, depth, pValue)
}
return(res)
}
# for GenomicRanges with N bins, extract the information
# of the Genomic ranges for a given bin
extractBinGRanges <- function(GenoRanges, bin, nbins) {
if(!is(GenoRanges, "GRanges"))
stop('GenoRanges has to be of class GRanges ..')
if(!is.numeric(bin) || length(bin) != 1 || is.na(bin))
stop(' bin has to be a single non-NA integer ')
if(!is.numeric(nbins) || length(nbins) != 1 || is.na(nbins))
stop(' nbins has to be a single non-NA integer ')
if( bin < 1 || bin > nbins)
stop(' bin has to be between 1 and nbins ')
binsize <- round(width(GenoRanges)/nbins)
starts <- start(GenoRanges) + (bin-1)*binsize
gnew <- GRanges(as.character(seqnames(GenoRanges)),
IRanges(starts, width=binsize),
strand=as.character(strand(GenoRanges)))
return(gnew)
}
# As in mapBSdata2GRanges, but returns mC calls
# within each bin of each Genomic region
mapBSdata2GRangesBin <- function(GenoRanges, Sample,
context='all', mC=1, depth=0, pValue=1, nbins){
if(!is(GenoRanges, "GRanges"))
stop('GenoRanges has to be of class GRanges ..')
if(!is(Sample,'BSdata'))
stop('Sample has to be of class BSdata ..')
if(length(which(!(context %in% c('all','CG','CHG','CHH')))) > 0)
stop('context has to be either all or a combination of CG, CHG, and CHH ..')
if(!is.numeric(mC) && mC < 0 )
stop('mC has to be of class numeric and positive..')
if(!is.numeric(depth))
stop('depth has to be of class numeric ..')
if(!is.numeric(pValue))
stop('pValue has to be of class numeric ..')
if(pValue > 1)
stop('pValue has to be lower than 1 ..')
if(!is.numeric(nbins))
stop('nbins has to be of class numeric ..')
resList <- list()
if(nbins == 1)
resList[[1]] <- mapBSdata2GRanges(GenoRanges, Sample, context)
else
{
for(bin in 1:nbins) {
gel <- extractBinGRanges(GenoRanges=GenoRanges, bin=bin, nbins=nbins)
resList[[bin]] <- mapBSdata2GRanges(GenoRanges=gel, Sample=Sample,
context, mC, depth, pValue)
}
}
resrev <- list()
for(GEind in 1:length(GenoRanges)) resrev[[GEind]] <-
lapply(resList, function(x) x[[GEind]])
return(resrev)
}
# get genomic Cxx positons for a series of genomic regions
getCpos <- function(GenoRanges, seqContext='all', nbins, org) {
if(!is(GenoRanges, "GRanges"))
stop('GenoRanges has to be of class GRanges ..')
if(!(seqContext %in% c('all','CG','CHG','CHH')))
stop('seqContext has to be one of all, CG, CHG or CHH ..')
if(!is.numeric(nbins))
stop('nbins has to be of class numeric ..')
if(!is(org,"BSgenome"))
stop('org has to be of class BSgenome ..')
chrs <- as.character(seqnames(GenoRanges))
if(length(which(!(unique(chrs) %in% names(org)))) > 0)
stop('Object covers chromosomes not available in org ..')
resList <- as.list(rep(NA, length(GenoRanges)))
Nbins <- nbins
for(Chr in unique(chrs)) {
chrinds <- which(as.character(seqnames(GenoRanges)) == Chr)
GenoRangesChr <- GenoRanges[seqnames(GenoRanges) == Chr]
chrseq <- unmasked(org[[Chr]])
if(seqContext != 'all') resList[chrinds] <-
getCposChr(GenoRanges= GenoRangesChr, seqContext=seqContext,
chrseq=chrseq, Nbins)
else {
resCG <- resList[chrinds] <- getCposChr(GenoRanges= GenoRangesChr,
seqContext='CG', chrseq=chrseq, Nbins)
resCHG <- resList[chrinds] <- getCposChr(GenoRanges= GenoRangesChr,
seqContext='CHG', chrseq=chrseq, Nbins)
resCHH <- resList[chrinds] <- getCposChr(GenoRanges= GenoRangesChr,
seqContext='CHH', chrseq=chrseq, Nbins)
resALL <- resCG
for(i in 1:length(resALL)) {
if(Nbins > 1) { for(bin in 1:Nbins)
resALL[[i]][[bin]] <- sort(c(unlist(resCG[[i]][[bin]]),
unlist(resCHG[[i]][[bin]]), unlist(resCHH[[i]][[bin]]))) }
else resALL[[i]][[1]] <- sort(c(unlist(resCG[[i]]),
unlist(resCHG[[i]]), unlist(resCHH[[i]])))
}
resList <- resALL
}
}
return(resList)
}
# get genomic Cxx positons for a series of genomic regions on a given chr seq
getCposChr <- function(GenoRanges, seqContext, chrseq, nbins) {
if(length(unique(as.character(seqnames(GenoRanges)))) > 1)
stop('GenoRanges maps to several chromosomes ..')
if(!(seqContext %in% c('CG','CHG','CHH')))
stop('seqContext has to be one of CG, CHG or CHH ..')
if(!is(chrseq,"DNAString"))
stop('chrseq has to be of class DNAString (unmasked) ..')
if(!is.numeric(nbins))
stop('nbins has to be of class numeric ..')
typeD <- DNAString(seqContext)
typeDr <- reverse(DNAString(seqContext))
Nbins <- nbins
chrv <- Views(chrseq, start=start(GenoRanges), end=end(GenoRanges))
vls <- as(chrv,'DNAStringSet')
vlsC <- Biostrings::complement(vls)
Cpos <- startIndex(vmatchPattern(typeD, vls, fixed='subject'))
CposC <- endIndex(vmatchPattern(typeDr, vlsC, fixed='subject'))
for(i in 1:length(Cpos)) {
res <- c(Cpos[[i]], CposC[[i]])
if(!is.null(res)) Cpos[[i]] <- sort(res)
}
# restore bin structure (Nbins entry for each genomic region)
CposList <- list()
for(i in 1:length(Cpos))
{
binList <- list()
startPos <- 1
binsize <- round(width(GenoRanges)[i])/Nbins
endPos <- binsize
for(bin in 1:Nbins)
{
ind <- which(Cpos[[i]] >= startPos & Cpos[[i]] <= endPos)
binList[[bin]] <- Cpos[[i]][ind]
startPos <- endPos+1
endPos <- endPos+binsize
}
if(length(binList) == 0)
CposList[[i]] <- NA
else
CposList[[i]] <- binList
}
return(CposList)
}
getCposDensity<- function(GenoRanges, Cpos, nbins) {
allwidths <- width(GenoRanges)
Nbins <- nbins
for(i in 1:length(Cpos)) {
widths <- rep(round(allwidths[i]/Nbins), Nbins)
if(all(is.na(Cpos[[i]])))
Cpos[[i]] <- rep(0,nbins)
else
Cpos[[i]] <- sapply(Cpos[[i]], length)/widths
}
return(Cpos)
}
# assign binmC, binC, binrC for GEcollection using Tabix based BSdata ...
profileDNAmetBin <- function(GenoRanges, Sample,
mcCLASS='mCG',
mC=1, depthThr=0, mCpv=1,
minCoverage=0.75, nbins=2) {
if(!is(GenoRanges, "GRanges"))
stop('GenoRanges has to be of class GRanges ..')
if(!is(Sample,"BSdata"))
stop('Sample has to be of class BSdata ..')
if(length(which(!(mcCLASS %in% c('mCG','mCHG','mCHH')))) > 0)
stop('mcCLASS has to be one of mCG, mCHG, and mCHH ..')
if(!is.numeric(mC) && mC < 0 )
stop('mC has to be of class numeric and positive..')
if(!is.numeric(depthThr))
stop('depthThr has to be of class numeric ..')
if(!is.numeric(mCpv))
stop('mCpv has to be of class numeric ..')
if(!is.null(minCoverage) && !is.numeric(minCoverage))
stop('minCoverage has to be either NULL or of class numeric ..')
if(!is.numeric(nbins))
stop('nbins has to be of class numeric ..')
Nbins <- nbins
# extracting the mC methylation level 2sec
bsdat <- mapBSdata2GRangesBin(GenoRanges=GenoRanges, Sample= Sample,
context=sub('m','',mcCLASS), mC=mC,
depth=depthThr, pValue=mCpv, nbins=Nbins)
# binmC
sumFun <- function(x) {
suppressWarnings(if(is.na(x)[[1]]) return(NA))
sum(mcols(x)$C/(mcols(x)$C+mcols(x)$T))
}
mlsum <- lapply(bsdat, sapply, sumFun)
binmC <- matrix(unlist(mlsum), length(mlsum), Nbins, byrow=T)
unmethList <- list()
uncov <- Sample@uncov
uncov <- uncov[mcols(uncov)$score <= depthThr]
seqlengths(uncov) <- NA
seqlengths(GenoRanges) <- NA
### missing sequencing coverage has value NA and unmethylated has value 0
for(bin in 1:Nbins) {
gel <- extractBinGRanges(GenoRanges=GenoRanges, bin=bin, nbins=Nbins)
ov <- findOverlaps(gel,uncov)
if(length(ov) > 0)
{
val <- rep(0,length(GenoRanges))
ind <- unique(queryHits(ov))
val[ind] <- NA
unmethList[[bin]] <- val
}
else
{
val <- rep(0,length(GenoRanges))
unmethList[[bin]] <- val
}
}
unmethList <- matrix(unlist(unmethList), Nbins, length(GenoRanges), byrow=T)
unmethList <- t(unmethList)
ind <- which(is.na(binmC))
binmC[ind] <- unmethList[ind]
wbp <- round(width(GenoRanges)/Nbins)
binmC <- apply(binmC, 2, function(x) x/wbp)
# binC
Cpos <- getCpos(GenoRanges= GenoRanges,
seqContext= sub('m','', mcCLASS),
org= Sample@org, nbins=Nbins)
CposD <- getCposDensity(GenoRanges, Cpos=Cpos, nbins=Nbins)
binC <- matrix(unlist(CposD), length(CposD), Nbins, byrow=T)
# reversing for minus strand if strand is defined
if(is.null(dim(binmC)))
binmC <- matrix(binmC, nrow=1, ncol=length(binmC), byrow=T)
minusStrand <- which(as.character(strand(GenoRanges)) == '-')
if(length(minusStrand) > 0) {
binmC[minusStrand,] <- binmC[minusStrand, Nbins:1]
binC[minusStrand,] <- binC[minusStrand, Nbins:1]
}
# adding relative methylation (C/mC) expressed on [0,100] range
if(is.null(ncol(binmC)))
binmC <- matrix(binmC, ncol=length(binmC))
binrC <- binmC
for(i in 1:ncol(binrC)) binrC[,i] <- 100*binmC[,i]/binC[,i]
binrC[is.infinite(binrC)] <- NA
# adding uncovered region information
maskUncovered <- uncov
suppressWarnings(nonCoveredInds <- findOverlaps(GenoRanges, maskUncovered))
orgInds <- queryHits(nonCoveredInds)
binrC[orgInds,] <- NA
# cutting some decimals and specifying row/column names for the slots
row_names <- paste(as.character(seqnames(GenoRanges)),
":", start(GenoRanges),"-", end(GenoRanges), sep="")
col_names <- c(1:Nbins)
binmC <- signif(binmC, 3)
colnames(binmC) <- col_names
rownames(binmC) <- NULL
binC <- signif(binC, 3)
colnames(binC) <- col_names
rownames(binC) <- NULL
binrC <- signif(binrC, 3)
colnames(binrC) <- col_names
rownames(binrC) <- NULL
binscore <- matrix(NA, nrow(binmC),
ncol(binmC), dimnames=list(row_names, col_names))
rownames(binscore) <- NULL
#rownames(GenoRanges) <- row_names
# saving chr results
Object <- new("GEcollection",
SummarizedExperiment(assays=list(binmC=binmC,
binC=binC, binrC=binrC,
binscore=binscore),
rowRanges=GenoRanges))
#rownames(Object) <- row_names
Object
}
profileDNAmetBinParallel <- function(GenoRanges, Sample, mcCLASS='mCG',
mC=1, depthThr=0, mCpv=1,
minCoverage=0.75, Nproc=1, nbins=2){
if(!is(GenoRanges, "GRanges"))
stop('GenoRanges has to be of class GRanges ..')
if(!is(Sample,"BSdata"))
stop('Sample has to be of class BSdata ..')
if(length(which(!(mcCLASS %in% c('mCG','mCHG','mCHH')))) > 0)
stop('mcCLASS has to be one of mCG, mCHG, and mCHH ..')
if(!is.numeric(mC) && mC < 0 )
stop('mC has to be of class numeric and positive..')
if(!is.numeric(depthThr))
stop('depthThr has to be of class numeric ..')
if(!is.numeric(mCpv))
stop('mCpv has to be of class numeric ..')
if(!is.null(minCoverage) && !is.numeric(minCoverage))
stop('minCoverage has to be either NULL or of class numeric ..')
if(!is.numeric(Nproc) && Nproc < 1)
stop('Nproc has to be of class numeric ..')
if(!is.numeric(nbins))
stop('nbins has to be of class numeric ..')
Chrs <- unique(as.character(seqnames(GenoRanges)))
tabixChrs <- seqnamesTabix(Sample@file)
otherChr <- which(!(as.character(seqnames(GenoRanges)) %in% tabixChrs))
otherChr <- which(!(as.character(seqnames(GenoRanges)) %in% tabixChrs))
if(length(otherChr) > 0) GenoRanges <- GenoRanges[-c(otherChr)]
Chrs <- unique(as.character(seqnames(GenoRanges)))
Nproc <- min(Nproc, length(Chrs))
cl <- makeCluster(Nproc, 'PSOCK')
# a load balanced parallel execution of the profileDNAmetBin method is run
profileDNAmetBinChr <- function(CHR, GenoRanges,
Sample, mcCLASS, mC,
depthThr, mCpv,
minCoverage, nbins) {
GenoRangesChr <- GenoRanges[seqnames(GenoRanges) == CHR]
res <- profileDNAmetBin(GenoRanges=GenoRangesChr, Sample=Sample,
mcCLASS=mcCLASS, mC=mC, depthThr=depthThr,
mCpv=mCpv, minCoverage=minCoverage, nbins=nbins)
res
}
clRes <- clusterApplyLB(cl, Chrs, profileDNAmetBinChr,
GenoRanges=GenoRanges, Sample=Sample,
mcCLASS=mcCLASS, mC=mC, depthThr=depthThr,
mCpv=mCpv, minCoverage=minCoverage, nbins=nbins)
chrcomb <- NULL
chrcomb <- do.call("rbind", clRes)
Object <- new("GEcollection", SummarizedExperiment(assays=assays(chrcomb),
rowRanges(chrcomb)))
Object
}
# function to plot gene locus visualization with methylation/omics data
plotMeth <- function(grl=NULL, colors=NULL, datatype=NULL, yLim, brmeth=NULL, mcContext="CG", annodata=NULL,
transcriptDB, chr, start, end, org){
if(!is.null(grl) && !is(grl,'list') && !is(grl,'GElist'))
stop('grl has to be either NULL or an object of class list or GElist...')
if(is(grl,'list'))
{
if(any(!(sapply(grl, class) %in% c("GRanges", 'GEcollection'))))
stop('grl has to be a list of either GRanges or GEcollection object...')
}
if(!is.null(brmeth) && !is(brmeth,'list'))
stop('brmeth has to be either NULL or of class list ...')
if(is(brmeth,'list'))
{
if(any(!(sapply(brmeth, class) %in% c("BSdata"))))
stop('grl has to be a list of BSdata object...')
}
if(is.null(grl) && is.null(brmeth))
stop('both grl and brmeth cannot be NULL...')
if(!is.null(colors) && !is.character(colors))
stop('colors has to be either NULL or of class character ...')
if(!is.null(grl))
{
if(is.null(names(grl)))
stop('names of grl cannot be NULL...')
if(is.null(datatype))
stop('datatype cannot be NULL when grl is defined and has to be of length equal to grl...')
if(is.character(colors))
{
if(length(colors) != length(grl))
stop('length of colors has to be same as length of grl ...')
}
if(!is.numeric(yLim))
stop('yLim has to be of class numeric ..')
if(!is.null(datatype))
{
for(i in 1:length(datatype)) {
dti <- datatype[i]
if(!(dti %in% c('density', 'C','mC', 'rC','cols')))
stop('datatype has to be an array containing
one of: density, C, mC, rC, cols')
}
if(length(datatype) != length(grl) && length(grl) != length(yLim))
stop('grl, datatype and yLim has to be of same length ..')
}
}
if(length(which(!(mcContext %in% c('all','CG','CHG','CHH')))) > 0)
stop('mcContext has to be one of all, CG, CHG, and CHH ..')
if(!is.null(annodata) && !is(annodata,'GRangesList'))
stop('annodata has to be either NULL or of class GRangesList ...')
if(!is(transcriptDB,"TxDb") && !is.null(transcriptDB))
stop('transcriptDB has to be either NULL or an object of class TxDb ..')
if(!is.character(chr))
stop('chr has to be of class character ..')
if(!is.numeric(start))
stop('start has to be of class numeric ..')
if(!is.numeric(end))
stop('end has to be of class numeric ..')
if(!is(org, "BSgenome"))
stop('org has to be of class BSgenome ..')
gen <- org@provider_version
itrack <- IdeogramTrack(genome = gen, chromosome = chr)
axisTrack <- GenomeAxisTrack()
dTrack <- NULL
brTrack <- NULL
AnnoTrack <- NULL
txTrack <- NULL
if(!is.null(grl))
{
matlist <- list()
dTrack <- list()
for (i in 1:length(grl))
{
grl_trackname <- names(grl)
if(datatype[i] == 'mC') {
refgr <- rowRanges(grl[[i]])
matlist[[i]] <- t(apply(binmC(grl[[i]]),1,mean))
}
if(datatype[i] == 'C') {
refgr <- rowRanges(grl[[i]])
matlist[[i]] <- t(apply(binC(grl[[i]]),1,mean))
}
if(datatype[i] == 'rC') {
refgr <- rowRanges(grl[[i]])
matlist[[i]] <- t(apply(binrC(grl[[i]]),1,mean))
}
if(datatype[i] == 'density') {
refgr <- rowRanges(grl[[i]])
matlist[[i]] <- t(apply(binscore(grl[[i]]),1,mean))
}
if(datatype[i] == 'cols') {
refgr <- grl[[i]]
matlist[[i]] <- t(as.matrix(mcols(grl[[i]])[1]))
}
if(is.null(colors))
dTrack[[i]] <- DataTrack(range=refgr, data=matlist[[i]] , name=grl_trackname[i], chromosome=chr,
fill="darkgreen", ylim=c(0, yLim[i]))
else
dTrack[[i]] <- DataTrack(range=refgr, data=matlist[[i]] , name=grl_trackname[i], chromosome=chr,
fill=colors[i], ylim=c(0, yLim[i]))
}
}
brmeth_gr <- GRanges(chr,IRanges(start,end))
if(abs(start-end) < 50){
strack <- SequenceTrack(org)
anntrack <- list(itrack, axisTrack, strack)
}
else
anntrack <- list(itrack, axisTrack)
if(!is.null(grl))
plotype_length <- length(grl)
else
plotype_length <- 0
if(!is.null(brmeth))
{
brTrack <- list()
bsdat <- NULL
brlist <- list()
if(is.null(names(brmeth)))
stop('names of brmeth cannot be NULL...')
brmeth_trackname <- names(brmeth)
for (i in 1:length(brmeth))
{
bsdat[i] <- unlist(mapBSdata2GRanges(GenoRanges=brmeth_gr, Sample= brmeth[[i]],
context=mcContext))
if(is.na(bsdat[i]))
stop('This genomic range does not have any base resolution methylation data for the given sequence context..')
mc <- mcols(bsdat[[i]])$C/(mcols(bsdat[[i]])$C+ mcols(bsdat[[i]])$T)
mc <- as.matrix(mc)
temp_gr <- bsdat[[i]]
strand(temp_gr) <- "*"
brTrack[[i]] <- DataTrack(range=temp_gr, data=t(mc) , name=brmeth_trackname[i], chromosome=chr,
col.histogram="black", ylim=c(0, 1))
}
plotype_length <- plotype_length+length(brmeth)
}
plottype <- rep("histogram", plotype_length)
#### adding annotation track
if(!is.null(annodata))
{
AnnoTrack <- list()
if(is.null(names(annodata)))
stop('names of annodata cannot be NULL...')
anno_trackname <- names(annodata)
col <- rainbow(length(annodata))
for (i in 1:length(annodata))
{
AnnoTrack[[i]] <- AnnotationTrack(annodata[[i]], name=anno_trackname[i], fill=col[i])
}
}
if(!is.null(transcriptDB))
{
txdb <- transcriptDB
txTrack <- GeneRegionTrack(txdb, chromosome=chr, name="Transcripts", showId=TRUE,
geneSymbol=TRUE, background.panel = "#FFFEDB", background.title = "brown")
}
combtrack <- c(anntrack, dTrack, brTrack, AnnoTrack, txTrack)
plotTracks(combtrack, type=plottype, chromosome=chr, from=start, to=end,
reverseStrand = FALSE, background.panel = "#FFFEDB", background.title = "brown", col="black")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.