## show a list of available organisms from UCSC database
ShowAvailableOrg <- function() {
data(Organisms)
}
GetPromoterAnno <- function(organism, save = FALSE, Dir = NULL) {
ShowAvailableOrg()
if (!organism %in% Organisms[,2])
stop(
'Organism not found. Please call funcition ShowAvailabelOrg() to retrive a list of available organisms from UCSC. Remember to use accession code rather than the organism name directly.'
)
ucsc_db <-
src_mysql(organism, 'genome-mysql.cse.ucsc.edu', user = 'genome')
Ref_Flat <-
tbl(ucsc_db, sql("SELECT geneName, chrom, strand, txStart, txEnd FROM refFlat"))
Promoter_Anno <- GetPromoters(as.data.frame(Ref_Flat))
if (save) {
if (is.null(Dir))
stop('please provide a path where annotation file will be saved')
write.table(
Promoter_Anno, paste(Dir, '/Promoter_Anno.bed', sep = ''), quote = F, sep = '\t', row.names = F
)
}
return(Promoter_Anno)
}
bedtoolsr <- function(bam = NULL, bamdir = NULL, bed) {
if (is.null(bamdir)) {
if (is.null(bam) | !file.exists(bam))
stop('bam file not provided or can not be found.')
else
bamfiles <- bam
}
else
bamfiles <- grep('.bam', dir(bamdir), value = TRUE)
if (!length(bamfiles))
stop('There are no bam files in the directory')
else{
bamfiles <- paste(bamdir, '/', bamfiles, sep = '')
temp1 <- c()
for (i in 1:length(bamfiles)) {
command <-
paste('coverageBed -abam', bamfiles[i], '-b', bed, sep = ' ')
cat('processing: ', bamfiles[i], '\n')
temp <- fread(input = command, data.table = FALSE)
temp <- arrange(temp, V1, V2, V3)
temp1 <- bind_cols2(temp1, temp[6])
}
temp1 <- as.data.frame(temp1)
colnames(temp1) <- paste('S', 1:ncol(temp1), sep = '')
temp1$name <- temp$V4
temp <- arrange(temp, name)
return(list(count = temp1,
gaplength = temp[,(ncol(temp) - 1)]))
}
}
bind_cols2 <- function(a, b){
if(!length(a) | !length(b)){
if(!length(a))
return(b)
if(!length(b))
return(a)
}
else
return(bind_cols(a, b))
}
GetPromoters <-
function(uctable, upstream = 2000, downstream = 2000) {
uctable <- uctable %>%
filter(sapply(strsplit(chrom, '_'), length) == 1)
uniqid <- uctable %>%
select(geneName) %>%
distinct(geneName)
### A single gene might encode transcript on different chromsome,
### strand or transcriptional start site
uctable1 <- data.frame()
for (i in 1:length(uniqid$geneName)) {
cat(i, '\n')
index <-
grep(paste('^', uniqid$geneName[i], '$', sep = ''), uctable$geneName)
temp <- unique(uctable$chrom[index])
temp1 <- unique(uctable$strand[index])
for (j in 1:length(temp)) {
for (k in 1:length(temp1)) {
index1 <-
which(uctable[index,2] == temp[j] & uctable[index, 3] == temp1[k])
if (length(index1)) {
if (temp1[k] == '+')
temp2 <- unique(uctable[index[index1],4])
else
temp2 <- unique(uctable[index[index1],5])
temp3 <-
data.frame(
Symbol = paste(uniqid$geneName[i], '#', 1:length(temp2), sep = ''),
Chrom = rep(temp[j], length(temp2)),
Strand = rep(temp1[k], length(temp2)),
TSS = temp2, stringsAsFactors = FALSE
)
uctable1 <- rbind(uctable1, temp3)
}
}
}
}
Promoter4k <- data.frame(
chrom = uctable1$Chrom,
chromStart = ifelse(
uctable1$Strand == '+', uctable1$TSS - upstream, uctable1$TSS - downstream
),
chromEnd = ifelse(
uctable1$Strand == '+', uctable1$TSS + downstream, uctable1$TSS + upstream
),
name = uctable1$Symbol,
Strand = uctable1$Strand
)
Promoter4k <- Promoter4k[!duplicated(Promoter4k$name),]
Promoter4k$chromStart[Promoter4k$chromStart < 0] <- 0
return(Promoter4k)
}
## retrive chromsome length information from UCSC database.
GetChromLength <- function(organism) {
cat('Retriving chromsome length information from UCSC database', '\n')
ShowAvailableOrg()
if (!organism %in% Organisms[,2])
stop(
'Organism not found. Please call funcition ShowAvailabelOrg() to retrive a list of available organisms from UCSC. Remember to use accession code rather than the organism name directly.'
)
ucsc_db <-
src_mysql(organism, 'genome-mysql.cse.ucsc.edu', user = 'genome')
Ref_Flat <- tbl(ucsc_db, sql("SELECT chrom, size FROM chromInfo"))
}
#### create window function to create 100bp window across whole genome
CreateWindows <- function(chromlen, binsize) {
cat(paste("Generate ", binsize, " bp windows across the whole genome", sep = ''), "\n")
Wholegenome_bin <-
foreach (
i = 1:length(chromlen$size), .options.multicore = list(preschedule = FALSE),
.combine = bind_rows, .inorder = TRUE, .verbose =
TRUE,
.errorhandling = 'stop', .multicombine =
TRUE
) %dopar% {
cat(i, '\t')
bin <-
seq(0, chromlen$size[i], by = binsize)
temp <-
data.frame(
chrom = rep(chromlen$chrom[i], length(bin)),
chromStart = bin,
chromEnd = c(bin[2:length(bin)], chromlen$size[i])
)
temp
}
return(Wholegenome_bin)
}
########### exclude regions: 1. in promoter regions, 2. in CpG island regions 3. ATCG content is not 100%
Getfasta <- function(fa, bed) {
cat("Building sequence file for specified binsize", "\n")
if (is.null(fa) | !file.exists(fa))
stop('Sequence file not provided or cannot be found')
if (is.null(bed) | !file.exists(bed))
stop('Bed file not provided or cannot be found')
command <-
paste('bedtools getfasta -fi', fa, '-bed', bed, '-fo', gsub('bed$', 'fa', bed), sep = ' ')
system(command)
}
ConstructExRegions <- function(organism, promoter_anno, CpG = TRUE) {
cat("Construct regions to exclude when building background noise", "\n")
promoter_anno <-
select(promoter_anno, chrom, chromStart, chromEnd)
if (CpG) {
ShowAvailableOrg()
if (!organism %in% Organisms[,2])
stop(
'Organism not found. Please call funcition ShowAvailabelOrg() to retrive a list of available organisms from UCSC. Remember to use accession code rather than the organism name directly.'
)
ucsc_db <-
src_mysql(organism, 'genome-mysql.cse.ucsc.edu', user = 'genome')
CpGisland <-
as.data.frame(tbl(
ucsc_db, sql(
"SELECT chrom, chromStart, chromEnd FROM cpgIslandExtUnmasked"
)
))
colnames(promoter_anno) <-
colnames(CpGisland) <- c('chrom', 'start', 'end')
bind_rows(promoter_anno, as.data.frame(CpGisland))
}
else
promoter_anno
}
FilterRegions <- function(bed, fasta, Exbed) {
cat("Build preliminary background annotation", "\n")
Exclude_index <- ExcludeIntersection(bed, Exbed)
rm(Exbed)
letter_freq <- CountFreqency(fasta)
Exclude_index1 <- which(letter_freq$ATCG != 1)
Exclude_index_all <-
Reduce(dplyr::union, list(Exclude_index, Exclude_index1))
rm(list = c("Exclude_index", "Exclude_index1"))
Filter_regions <-
bind_cols(bed[-Exclude_index_all,], letter_freq[-Exclude_index_all,][1])
rm(list = c("Exclude_index_all", "bed", "letter_freq"))
colnames(Filter_regions) <-
c('chrom', 'chromStart', 'chromEnd', 'GC')
return(Filter_regions)
}
CountFreqency <- function(fasta, CG = TRUE, ATCG = TRUE) {
fasta_file <- readDNAStringSet(fasta,use.names = TRUE)
Freqmat <- data.frame(matrix(NA, nrow = length(fasta_file), ncol = 2))
colnames(Freqmat) <- c('CG', 'ATCG')
if (CG)
Freqmat$CG <-
letterFrequency(fasta_file, letters = 'CG', as.prob = TRUE)[,1]
if (ATCG)
Freqmat$ATCG <-
letterFrequency(fasta_file, letters = 'ATCG', as.prob = TRUE)[,1]
return(Freqmat)
}
########## Exclude intersection between regions
ExcludeIntersection <-
function(data1, data2) {
if (ncol(data1) != ncol(data2))
stop('The two datasets must have equal column size.')
binsize <- data1[1,3] - data1[1,2]
foreach (
i = 1:nrow(data2), .options.multicore = list(preschedule = FALSE),
.combine = c, .inorder = TRUE, .verbose = TRUE,
.errorhandling = 'stop', .multicombine = TRUE
) %dopar% {
cat(i, '\t')
index <- which(data1[,1] == data2[i,1])
index1 <- which(data1[index,2] + binsize > data2[i,2] & data1[index,3] + binsize < data2[i,3])
if (length(index1))
index[index1]
}
}
######### identify background region for each promoter and calculate background counts for each promoter
### for each gene, get 40 100bp windows that have lowest average rpkm, then add the count
### together as the background estimation for this gene
### all the genes have at least 4k up or down stream
### cannot find enough windows in -4k + 4k windows
### will try to find closest 80 windows (GC < 0.4), and choose 40 among them based on methylation levels
IdentifyBackground <-
function(organism, bed_path, binsize = 100, promo_bed, fa) {
chromlen <- as.data.frame(GetChromLength(organism))
chromlen <- chromlen[which(sapply(strsplit(chromlen$chrom, "_"), length) ==1),]
bed_100bp <- as.data.frame(CreateWindows(chromlen, binsize))
rm(chromlen)
bed_path <- paste(bed_path, 'bed_100bp.bed', sep = '')
storage.mode(bed_100bp[,2]) <-
storage.mode(bed_100bp[,3]) <- 'integer'
cat("Write bed file to disk", "\n")
write.table(
bed_100bp, bed_path, quote = FALSE, sep = '\t', row.names = FALSE, col.names = FALSE
)
Exclude_regions <-
as.data.frame(ConstructExRegions(organism, promo_bed))
Getfasta(fa, bed_path)
bed_100bp_filtered <-
FilterRegions(bed_100bp, gsub('bed$', 'fa', bed_path), Exclude_regions)
rm(Exclude_regions)
Background_region <-
foreach (
i = 1:nrow(promo_bed), .options.multicore = list(preschedule = FALSE),
.combine = bind_rows, .inorder = TRUE, .verbose =
TRUE,
.errorhandling = 'stop', .multicombine =
TRUE
) %dopar% {
cat('promoter', i, '\t')
preset <-
filter(bed_100bp_filtered, chrom == promo_bed$chrom[i])
preset <-
mutate(
preset, proximity = abs(chromStart - promo_bed$chromStart[i]),
geneName = promo_bed$name[i]
)
preset <-
arrange(preset, proximity)
k <- 0
bgregion <- c()
for (j in 1:nrow(preset)) {
if (preset[j,4] < 0.4) {
k <- k + 1
bgregion <-
bind_rows(bgregion, preset[j,])
}
if (k == 80)
break
}
bgregion
}
return(Background_region)
}
CalculateTPM <- function(dataMat, gaplength) {
if (nrow(dataMat) != length(gaplength))
stop(
'The row dimension of the dataset has to equal to the length of gap length. Remember to use accession code rather than the organism name directly.'
)
libsize <- apply(dataMat, 2, sum)
gaplength <- t(pracma::repmat(gaplength, ncol(dataMat), 1))
dataMat_TPM <- (dataMat / gaplength) * 10 ^ 6
return(apply(dataMat_TPM, 1, mean))
}
Summerizebg <- function(dataMat_bg, gaplength) {
dataMat_bg$TPM <- CalculateTPM(dataMat_bg[, 1:(ncol(dataMat_bg) -1) ], gaplength)
dataMat_bg <- dataMat_bg %>%
group_by(name) %>%
arrange(TPM) %>%
dplyr::slice(1:40)
dataMat_bg <- dataMat_bg %>%
group_by(name) %>%
summarise_each(funs(sum)) %>%
arrange(name) %>%
select(-TPM)
}
MBDDiff <-
function(promoter, background, conditions, method = "pooled",
sharingMode = "maximum", fitType = "local", pvals_only = FALSE, paraMethod =
'NP') {
conditions <- as.factor(conditions)
rownames(promoter) <- promoter$name
promoter <- as.data.frame(promoter %>% select(-name))
rownames(background) <- background$name
background <- as.data.frame(background %>% select(-name))
commonGenes <- intersect(rownames(promoter), rownames(background))
promoter <- promoter[rownames(promoter) %in% commonGenes, ]
background <- background[rownames(background) %in% commonGenes,]
MBD <- XBSeqDataSet(promoter, background, conditions)
MBD <- estimateRealCount(MBD)
MBD <- estimateSizeFactors(MBD)
MBD <-
estimateSCV(MBD, method = method, sharingMode = sharingMode, fitType =
fitType)
Teststas <-
XBSeqTest(MBD, levels(conditions)[1L], levels(conditions)[2L], method =
paraMethod)
return(list(MBD, Teststas))
}
######### GC enrichment test by yidong's matlab program
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.