# getResultsFromJunctionSeq
# This function is used to get analysis
# results from using JunctionSeq @param dir.name Path name for sample
# information file
# @param sample.file Sample information file
# @param count.file Count file @param gff.file Annotation file
# @param ...Additional parameters(Define them based on runJunctionSeqAnalyses
# function in JuntionSeq)
# @return The analysis result from JunctionSeq R package
# @export
# @examples
#
# dir.name <- system.file('extdata',package='PathwaySplice')
# sample.file <- 'Sample_info.txt'
# count.file <- 'Counts.10.genes.txt'
# gff.file <- 'flat.chr22.10.genes.gff'
#
res <- getResultsFromJunctionSeq(dir.name, sample.file,count.file,gff.file, method.dispFinal = 'shrink',analysis.type = 'exonsOnly')
getResultsFromJunctionSeq <- function(dir.name, sample.file, count.file,
gff.file, ...)
{
# Get sample file
dir.name <- PathwaySplice:::reformatpath(dir.name)
#path.sample.file <- file.path(dir.name, sample.file)
path.sample.file <- sample.file
decoder.bySample <- read.table(path.sample.file, header = TRUE, stringsAsFactors = FALSE)
x <- colnames(decoder.bySample)
sample.ID.index <- which(colnames(decoder.bySample) == x[1])
group.ID.index <- which(colnames(decoder.bySample) == x[2])
Gender.index <- which(colnames(decoder.bySample) == x[3])
# Get count file
path.count.file <- file.path(dir.name, decoder.bySample[, sample.ID.index],
count.file)
# Get annotation file
#path.gff.file <- file.path(dir.name, "GTF_Files", gff.file)
path.gff.file <- gff.file
# Analysis using runJunctionSeqAnalyse, and adjust Gender
jscs <- runJunctionSeqAnalyses(sample.files = path.count.file, sample.names = decoder.bySample[,
sample.ID.index], condition = decoder.bySample[, group.ID.index],
flat.gff.file = path.gff.file, use.covars = decoder.bySample[,
x[3], drop = FALSE], test.formula0 = formula(paste("~ ", paste("sample",
"countbin", paste0(x[3], ":countbin"), sep = "+"))), test.formula1 = formula(paste("~ ",
paste("sample", "countbin", paste0(x[3], ":countbin"), "condition:countbin",
sep = "+"))), effect.formula = formula(paste("~ ", paste("condition",
x[3], "countbin", paste0(x[3], ":countbin"), "condition:countbin",
sep = "+"))), geneLevel.formula = formula(paste("~ ", paste(x[3],
"condition", sep = "+"))), ...)
return(jscs)
}
makeExample <- function(feature.table, num.gene)
{
gene.name <- unique(feature.table$geneID)
if (num.gene <= length(gene.name))
{
x <- sample(gene.name, num.gene)
temp3 <- feature.table[which(feature.table$geneID %in% x), ]
row.names(temp3) <- seq(1, dim(temp3)[1])
return(temp3)
} else
{
cat("Please choose a number that is less or equal to the total number of genes\n")
}
}
heatmap_wPCA = function(Data, g_level = NULL)
{
Data.pca = prcomp(t(Data))
hmcol <- rev(colorRampPalette(brewer.pal(10, "RdBu"))(256))
if (is.null(g_level))
{
type_level = 1:ncol(Data)
col_level = "black"
with(data.frame(Data.pca$x), scatter3D(PC1, PC2, PC3, colvar = NULL, type = "h",
ticktype = "detailed", bty = "b2", cex = 1, xlab = "PC 1", ylab = "PC 2",
zlab = "PC 3", theta = 40, phi = 40, pch = type_level, col = col_level,
main = "Principal component analysis"))
with(data.frame(Data.pca$x), text3D(x = PC1, y = PC2, z = PC3, colnames(Data),
col = "black", add = TRUE, colkey = FALSE, cex = 0.5))
} else
{
type_level = 1:ncol(Data)
TEMP = factor(g_level)
uniq_label = levels(TEMP)
levels(TEMP) = hmcol[ceiling(seq(length.out = length(levels(TEMP)), from = 1,
to = 256))]
col_level = as.character(TEMP)
uniq_col = levels(TEMP)
Data.pca = prcomp(t(Data))
with(data.frame(Data.pca$x), scatter3D(PC1, PC2, PC3, colvar = NULL, type = "h",
ticktype = "detailed", bty = "b2", cex = 1, xlab = "PC 1", ylab = "PC 2",
zlab = "PC 3", theta = 40, phi = 40, pch = type_level, col = col_level,
main = "Principal component analysis"))
legend("topright", legend = uniq_label, pch = type_level, col = uniq_col,
cex = 1, inset = c(0.02))
with(data.frame(Data.pca$x), text3D(x = PC1, y = PC2, z = PC3, colnames(Data),
col = "black", add = TRUE, colkey = FALSE, cex = 0.5))
}
}
getCount4EachBamUsingJobArray <- function(input.bam.dir, input.bam.pattern,
gtffile.gtf, output.file.dir)
{
index <- system("echo $LSB_JOBINDEX", intern = TRUE)
u <- as.integer(index)
if (!dir.exists(output.file.dir))
{
dir.create(output.file.dir, recursive = TRUE)
}
cmd.java.1 = "module load java/1.8.0_60"
cmd.java.2 = "export _JAVA_OPTIONS=\"-Xmx5G\""
cmd.java.3 = "java -jar $HOME/NGS_tools/QoRTs/QoRTs_1.1.8/QoRTs.jar QC --noGzipOutput --keepMultiMapped --stranded"
cmd = paste(cmd.java.1, cmd.java.2, cmd.java.3, sep = ";")
bam.list <- list.files(input.bam.dir, pattern = input.bam.pattern,
all.files = TRUE, recursive = TRUE, full.names = TRUE)
x <- lapply(bam.list, function(u)
{
sample.name <- basename(dirname(u))
cmd1 = paste(cmd, u, gtffile.gtf, file.path(output.file.dir, sample.name),
sep = " ")
cmd1
})
cmd2 <- x[[u]]
cat(cmd2, "\n\n")
system(cmd2)
}
createBsubJobArrayRfun <- function(Rfun, job.name, wait.job.name)
{
x <- useJobArrayOnPegasus("parallel", "72:00", 16, 25000, 8, job.name,
wait.job.name)
xx <- paste(x, paste0("\"R -e ", paste0("'", Rfun, "'"), "\""), sep = " ")
xx
}
useJobArrayOnPegasus <- function(job.option = c("general", "parallel",
"bigmem"), Wall.time, cores, Memory, span.ptile, job.name, wait.job.name = NULL)
{
job.option <- match.arg(job.option)
job.name.array <- job.name
switch(job.option, parallel = {
cmd0 = paste(Wall.time, "-n", cores, "-q parallel -R 'rusage[mem=",
Memory, "] span[ptile=", span.ptile, "]' -u aimin.yan@med.miami.edu",
sep = " ")
}, bigmem = {
cmd0 = paste(Wall.time, "-n", cores, "-q bigmem -R 'rusage[mem=",
Memory, "] span[ptile=", span.ptile, "]' -u aimin.yan@med.miami.edu",
sep = " ")
}, general = {
cmd0 = paste(Wall.time, "-n", cores, "-q general -R 'rusage[mem=",
Memory, "] span[ptile=", span.ptile, "]' -u aimin.yan@med.miami.edu",
sep = " ")
})
if (!is.null(wait.job.name))
{
cmd1 = paste0("bsub -w \"done(\"", wait.job.name, "\")\"", " -P bbc -J \"",
job.name, paste0("\" -o %J.", job.name.array, ".log "), paste0("-e %J.",
job.name.array, ".err -W"))
} else
{
cmd1 = paste0("bsub -P bbc -J \"", job.name, paste0("\" -o %J.",
job.name.array, ".log "), paste0("-e %J.", job.name.array,
".err -W"))
}
cmd = paste(cmd1, cmd0, sep = " ")
return(cmd)
}
# R -e
# 'library(PathwaySplice);PathwaySplice:::processBamFile('/projects/scratch/bbc/Project/Pengzhang_data2015/Alignment_len60','STAR_out.sorted.bam$','~/mus_musculus/Mus_musculus.GRCm38.83.processed.sorted.gtf','/scratch/projects/bbc/aiminy_project/peng_junction')'
# R -e
# 'library(PathwaySplice);PathwaySplice:::processBamFile('/projects/scratch/bbc/Project/Pengzhang_data2015/Alignment_len60','STAR_out.sorted.bam$','~/mus_musculus/Mus_musculus.GRCm38.83.processed.sorted.gtf','/scratch/projects/bbc/aiminy_project/peng_junction/count_strand_based')'
processBamFile <- function(input.bam.dir, input.bam.pattern, gtffile.gtf,
output.file.dir)
{
bam.list <- list.files(input.bam.dir, pattern = input.bam.pattern,
all.files = TRUE, recursive = TRUE, full.names = TRUE)
n <- length(bam.list)
job.name <- paste0("Count[1-", n, "]")
Rfun1 <- "library(PathwaySplice);re <- PathwaySplice:::getCount4EachBamUsingJobArray("
input <- input.bam.dir
input.bam.pattern <- input.bam.pattern
processed.gene.gtf <- gtffile.gtf
output <- output.file.dir
Rfun2 <- ")"
Rinput <- paste0("\\\"", input, "\\\",", "\\\"", input.bam.pattern,
"\\\",", "\\\"", processed.gene.gtf, "\\\",", "\\\"", output, "\\\"")
Rfun <- paste0(Rfun1, Rinput, Rfun2)
counting <- createBsubJobArrayRfun(Rfun, job.name, wait.job.name = NULL)
system(counting)
}
# PathwaySplice:::getResultsFromJunctionSeq2('~/Dropbox (BBSR)/BBSR
# Team
# Folder/Aimin_Yan/peng/count_strand_based','Sample_info.txt','QC.spliceJunctionAndExonCounts.forJunctionSeq.txt','mouse_str.gff','shrink','junctionsAndExons','~/Dropbox
# (BBSR)/BBSR Team
# Folder/Aimin_Yan/peng/count_strand_based/Output_jscs')
getResultsFromJunctionSeq2 <- function(dir.name, sample.file, count.file,
gff.file, method.dispFinal = c("shrink", "max", "fitted", "noShare"),
analysis.type, output.file.dir)
{
if (!dir.exists(output.file.dir))
{
dir.create(output.file.dir, recursive = TRUE)
}
# set up method for calculating dispFinal
method.dispFinal <- match.arg(method.dispFinal)
# Get sample file
dir.name <- reformatpath(dir.name)
path.sample.file <- file.path(dir.name, sample.file)
decoder.bySample <- read.table(path.sample.file, header = TRUE, stringsAsFactors = FALSE)
x <- colnames(decoder.bySample)
sample.ID.index <- which(colnames(decoder.bySample) == x[1])
group.ID.index <- which(colnames(decoder.bySample) == x[2])
# Get count file
path.count.file <- file.path(dir.name, decoder.bySample[, sample.ID.index],
count.file)
# Get annotation file
path.gff.file <- file.path(dir.name, "GTF_Files", gff.file)
jscs <- runJunctionSeqAnalyses(sample.files = path.count.file, sample.names = decoder.bySample[,
sample.ID.index], condition = decoder.bySample[, group.ID.index],
flat.gff.file = path.gff.file, analysis.type = analysis.type, nCores = 1,
verbose = TRUE, debug.mode = TRUE, use.multigene.aggregates = TRUE,
method.dispFinal = method.dispFinal)
save(jscs, file = file.path(output.file.dir, "jscs.RData"))
# return(jscs)
}
# R -e
# 'library(PathwaySplice);PathwaySplice:::submitJob4Jscs('/scratch/projects/bbc/aiminy_project/peng_junction/count_strand_based','Sample_info.txt','QC.spliceJunctionAndExonCounts.forJunctionSeq.txt','Mus_musculus.GRCm38.83.processed.sorted_stranded.gff','shrink','junctionsAndExons','/scratch/projects/bbc/aiminy_project/peng_junction/count_strand_based/Output_jscs')'
submitJob4Jscs <- function(dir.name, sample.file, count.file, gff.file,
method.dispFinal, analysis.type, output.file.dir)
{
job.name <- "RunJscs"
Rfun1 <- "library(PathwaySplice);re <- PathwaySplice:::getResultsFromJunctionSeq2("
input <- dir.name
sample.file <- sample.file
count.file <- count.file
gff.file <- gff.file
method.dispFinal <- method.dispFinal
analysis.type <- analysis.type
output <- output.file.dir
Rfun2 <- ")"
Rinput <- paste0("\\\"", input, "\\\",", "\\\"", sample.file, "\\\",",
"\\\"", count.file, "\\\",", "\\\"", gff.file, "\\\",", "\\\"",
method.dispFinal, "\\\",", "\\\"", analysis.type, "\\\",", "\\\"",
output, "\\\"")
Rfun <- paste0(Rfun1, Rinput, Rfun2)
jscs <- createBsubJobArrayRfun(Rfun, job.name, wait.job.name = NULL)
system(jscs)
}
makeGffFile <- function(input.gtf.file, stranded = c("yes", "no"), out.gff.dir)
{
cmd.java.1 = "module load java/1.8.0_60"
cmd.java.2 = "export _JAVA_OPTIONS=\"-Xmx5G\""
# input.gtf.file <-
# '~/mus_musculus/Mus_musculus.GRCm38.83.processed.sorted.gtf'
input.gtf.file.name <- tools::file_path_sans_ext(basename(input.gtf.file))
stranded <- match.arg(stranded)
switch(stranded, yes = {
cmd.java.3 = "java -jar $HOME/NGS_tools/QoRTs/QoRTs_1.1.8/QoRTs.jar makeFlatGff --stranded"
cmd = paste(cmd.java.1, cmd.java.2, cmd.java.3, sep = ";")
cmd1 <- paste(cmd, input.gtf.file, file.path(out.gff.dir, paste0(input.gtf.file.name,
"_stranded.gff"), sep = " "))
}, no = {
cmd.java.3 = "java -jar $HOME/NGS_tools/QoRTs/QoRTs_1.1.8/QoRTs.jar makeFlatGff"
cmd = paste(cmd.java.1, cmd.java.2, cmd.java.3, sep = ";")
cmd1 <- paste(cmd, input.gtf.file, file.path(out.gff.dir, paste0(input.gtf.file.name,
".gff"), sep = " "))
})
system(cmd1)
}
# R -e
# 'library(PathwaySplice);PathwaySplice:::submitJob4makeGffFile('~/mus_musculus/Mus_musculus.GRCm38.83.processed.sorted.gtf','yes','/scratch/projects/bbc/aiminy_project/peng_junction/count_strand_based/GTF_Files')'
# R -e 'r.lib<- Sys.getenv("R_LIBS_USER");source(file.path(r.lib,"PathwaySplice/bin/AnalysisScript.r"));submitJob4makeGffFile("~/Homo_sapiens.GRCh38.84.gtf","no","~/TestGFF")'
submitJob4makeGffFile <- function(input.gtf.file, stranded, out.gff.dir)
{
if (!dir.exists(out.gff.dir))
{
dir.create(out.gff.dir, recursive = TRUE)
}
job.name <- "processGtf"
Rfun1 <- 'r.lib<- Sys.getenv(\\\"R_LIBS_USER\\\");source(file.path(r.lib,\\\"PathwaySplice/bin/AnalysisScript.r\\\"));proceessGtf4makeGffFile('
Rinput <- paste0("\\\"", input.gtf.file, "\\\",",
"\\\"", out.gff.dir, "\\\"")
Rfun2 <- ")"
Rfun <- paste0(Rfun1, Rinput, Rfun2)
cmd.gff <- createBsubJobArrayRfun(Rfun, job.name, wait.job.name = NULL)
system(cmd.gff)
# proceessGtf4makeGffFile <- function(input.gtf.file,out.gff.dir,use.cluster=NULL)
job.name <- "makeGff"
Rfun1 <- 'r.lib<- Sys.getenv(\\\"R_LIBS_USER\\\");source(file.path(r.lib,\\\"PathwaySplice/bin/AnalysisScript.r\\\"));re <- makeGffFile('
input.gtf.file.name <- tools::file_path_sans_ext(basename(input.gtf.file))
input.gtf.file <- file.path(out.gff.dir, paste0(input.gtf.file.name, "_processed.gtf"))
Rinput <- paste0("\\\"", input.gtf.file, "\\\",", "\\\"", stranded,
"\\\",", "\\\"", out.gff.dir, "\\\"")
Rfun2 <- ")"
Rfun <- paste0(Rfun1, Rinput, Rfun2)
cmd.gff <- createBsubJobArrayRfun(Rfun, job.name, wait.job.name = "processGtf")
system(cmd.gff)
}
adjustBystatistics1 <- function(gene.based.table)
{
z.value <- qchisq(gene.based.table$geneWisePvalue, 1, lower.tail = FALSE)
m <- lm(z.value ~ gene.based.table$numFeature)
z.value.adjusted = mean(z.value) + residuals(m)
p.new <- pchisq(z.value.adjusted, df = 1, lower.tail = FALSE)
gene.based.table$geneWisePvalue <- p.new
gene.based.table
}
adjustBystatistics2 <- function(gene.based.table)
{
z.value <- qchisq(gene.based.table$geneWisePvalue, 1, lower.tail = FALSE)
m <- loess(z.value ~ gene.based.table$numFeature, span = 0.3)
z.value.adjusted = mean(z.value) + residuals(m)
p.new <- pchisq(z.value.adjusted, df = 1, lower.tail = FALSE)
gene.based.table$geneWisePvalue <- p.new
gene.based.table
}
adjustBystatistics3 <- function(gene.based.table, degree.poly)
{
z.value <- qchisq(gene.based.table$geneWisePvalue, 1, lower.tail = FALSE)
m <- lm(z.value ~ poly(gene.based.table$numFeature, degree.poly))
z.value.adjusted = mean(z.value) + residuals(m)
p.new <- pchisq(z.value.adjusted, df = 1, lower.tail = FALSE)
gene.based.table$geneWisePvalue <- p.new
gene.based.table
}
adjustBystatistics4 <- function(gene.based.table, nKnots = 6)
{
nKnots <- round(nKnots)
z.value <- qchisq(gene.based.table$geneWisePvalue, 1, lower.tail = FALSE)
m <- gam(z.value ~ s(gene.based.table$numFeature, k = nKnots, bs = "cr"))
z.value.adjusted = mean(z.value) + residuals(m)
p.new <- pchisq(z.value.adjusted, df = 1, lower.tail = FALSE)
gene.based.table$geneWisePvalue <- p.new
gene.based.table
}
# input.gtf.file <- /media/H_driver/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84.gtf
# out.gff.dir <-
#
# proceessGtf4makeGffFile(input.gtf.file,out.gff.dir)
#
proceessGtf4makeGffFile <- function(input.gtf.file,out.gff.dir,use.cluster=NULL)
{
#/media/H_driver/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84.gtf
r.lib<- Sys.getenv("R_LIBS_USER")
#Homo_sapiens.GRCh38.84.processed.sorted.22.gtf
input.gtf.file.name <- tools::file_path_sans_ext(basename(input.gtf.file))
cmd1 <- paste("sh",file.path(r.lib,"PathwaySplice/bin/processGtf.sh"),input.gtf.file,file.path(out.gff.dir, paste0(input.gtf.file.name,
"_processed.gtf")), sep = " ")
print(cmd1)
system(cmd1)
}
extractInfoFromGtf <- function() {
en<-ensemblGenome("/Volumes/Bioinformatics$/Aimin_project/GTF_Files")
read.gtf(en,"Homo_sapiens.GRCh38.84.gtf")
en2<-ensemblGenome("/Volumes/Bioinformatics$/Aimin_project/GTF_Files")
read.gtf(en2,"Homo_sapiens.GRCh38.84_processed.gff")
track.0 <- import("/Volumes/Bioinformatics$/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84.gtf")
track.00 <- track.0[-which(is.na(track.0$protein_id))]
track.process.gtf <- import("~/Dropbox (BBSR)/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84.processed.sorted.2.gtf")
track <- import("/Volumes/Bioinformatics$/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84_processed.gff")
track.exon.only <- track[which(track$type %in% c("exonic_part")),]
mcols(track.exon.only)
track.exon.only.df <- as.data.frame(track.exon.only)
track.exon.only.df.rm.plus <- track.exon.only.df[-grep("\\+",track.exon.only.df$gene_id),]
track.exon.only.df.plus <- track.exon.only.df[grep("\\+",track.exon.only.df$gene_id),]
head(track.exon.only.df.rm.plus)
write.table(track.exon.only.df.rm.plus, file = "~/Dropbox (BBSR)/Aimin_project/GTF_Files/exon.only.df.rm.plus.csv", append = FALSE, quote = FALSE, sep = ",",na = "", dec = ".", row.names = FALSE,col.names = TRUE)
write.table(track.exon.only.df.plus, file = "~/Dropbox (BBSR)/Aimin_project/GTF_Files/exon.only.df.include.plus.csv", append = FALSE, quote = FALSE, sep = ",",na = "", dec = ".", row.names = FALSE,col.names = TRUE)
track.process.gtf.df <- as.data.frame(track.process.gtf)
track.process.gtf.protein.coding.df <- track.process.gtf.df[which(track.process.gtf.df$gene_biotype %in% c("protein_coding")),]
track.process.gtf.non.protein.coding.df <- track.process.gtf.df[-which(track.process.gtf.df$gene_biotype %in% c("protein_coding")),]
num.protein.coding.gene <- length(unique(track.process.gtf.protein.coding.df$gene_id))
num.non.protein.coding.gene <- length(unique(track.process.gtf.non.protein.coding.df$gene_id))
re<-list(gene.without.multimapped=gene.based.without.multimapped$geneID,gene.with.multimapped=jscs.with.multimapped.4$geneID,protein.coding.gene=track.process.gtf.protein.coding.df$gene_id,non.protein.coding.gene=track.process.gtf.non.protein.coding.df$gene_id)
v <- venn.diagram(
x = re,
filename = NULL,
col = "black",
lwd = 1,
fill = c("red","blue","green","turquoise4"),
alpha = c(0.5, 0.5,0.5,0.5),
label.col = c(rep("black",15)),
cex = 1,
fontfamily = "serif",
fontface = "bold",
cat.col = c("red","blue","green","turquoise4"),
cat.cex = 1,
cat.fontfamily = "serif")
png(file.path("~/Dropbox (BBSR)/Aimin_project/Research/PathwaySplice/data/reCountKeepMultiMapped",paste0(paste(names(re),collapse = "_"),"_overlap_venn.png")),width = 1000, height = 600, units = "px")
grid.newpage()
pushViewport(viewport(width=unit(1, "npc"), height = unit(1, "npc")));
grid.draw(v)
dev.off()
track.process.gtf.df.2 <- cbind.data.frame(track.process.gtf.df$gene_id,track.process.gtf.df$protein_id)
track.process.gtf.df.3 <- track.process.gtf.df.2[!duplicated(track.process.gtf.df.2$`track.process.gtf.df$gene_id`), ]
gene.encoding <- track.process.gtf.df.3[-which(is.na(track.process.gtf.df.3$`track.process.gtf.df$protein_id`)),]
gene.non.encoding <- track.process.gtf.df.3[which(is.na(track.process.gtf.df.3$`track.process.gtf.df$protein_id`)),]
track.gene.rm.plus <- track.gene[-grep("\\+",track.gene$gene_id),]
export(track.exon.only,"~/Dropbox (BBSR)/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84.exon.only.gff",format = "gff3")
track.gene <- track[which(track$type %in% c("aggregate_gene","exonic_part")),]
export(track.gene.rm.plus,"~/Dropbox (BBSR)/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84_processed_exon.gff",format = "gff3")
track.gene.rm.plus <- track.gene[-grep("\\+",track.gene$gene_id),]
export(track.gene.rm.plus,"~/Dropbox (BBSR)/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84_processed_exon.gff",format = "gff3")
track.1 <- track[-grep("\\+",track$tx_set),]
track.2 <- track[-grep("\\+",track$gene_id),]
track.3 <- track.2[which(track.2$type=="aggregate_gene"),]
track.4 <- track.2[which(track.2$type=="exonic_part"),]
length(unique(track.4$gene_id))
}
# R -e 'r.lib<- Sys.getenv("R_LIBS_USER");source(file.path(r.lib,"PathwaySplice/bin/AnalysisScript.r"));getCount4EachBam("/media/H_driver/Aimin_project/GOSJ_STAR_Bam","STAR_out.sorted.bam$","/media/H_driver/Aimin_project/GTF_Files/Homo_sapiens.GRCh38.84.processed.sorted.2.gtf","java -jar /home/aiminyan/QoRTs/QoRTs.jar QC --keepMultiMapped","/media/aiminyan/DATA/Dropbox (BBSR)/Aimin_project/Research/PathwaySplice/data/reCountKeepMultiMapped")'
getCount4EachBam <- function(input.bam.dir, input.bam.pattern,
gtffile.gtf, cmd, output.file.dir)
{
#index <- system("echo $LSB_JOBINDEX", intern = TRUE)
#u <- as.integer(index)
if (!dir.exists(output.file.dir))
{
dir.create(output.file.dir, recursive = TRUE)
}
# cmd.java.1 = "module load java/1.8.0_60"
#
# cmd.java.2 = "export _JAVA_OPTIONS=\"-Xmx5G\""
#
# cmd.java.3 = "java -jar $HOME/NGS_tools/QoRTs/QoRTs_1.1.8/QoRTs.jar QC --noGzipOutput --keepMultiMapped --stranded"
#
# cmd = paste(cmd.java.1, cmd.java.2, cmd.java.3, sep = ";")
bam.list <- list.files(input.bam.dir, pattern = input.bam.pattern,
all.files = TRUE, recursive = TRUE, full.names = TRUE)
x <- lapply(bam.list, function(u,cmd,gtffile.gtf,output.file.dir)
{
sample.name <- basename(u)
cmd1 = paste(cmd, u, gtffile.gtf, file.path(paste0('"',output.file.dir,'"'),sample.name),
sep = " ")
cmd1
},cmd,gtffile.gtf,output.file.dir)
lapply(1:length(x),function(u,x){
cmd2 <- x[[u]]
cat(cmd2, "\n\n")
system(cmd2)
},x)
}
# input.without.multiplemapped.dir <- "~/Dropbox (BBSR)/Aimin_project/Research/PathwaySplice/ExampleData4paper/SRR1660308_STAR_out.sorted.bam"
#
# input.with.multiplemapped.dir <- "~/Dropbox (BBSR)/Aimin_project/Research/PathwaySplice/data/reCountKeepMultiMapped/SRR1660308_STAR_out.sorted.bam"
#
# input.file.pattern <- "QC.spliceJunctionAndExonCounts.forJunctionSeq.txt"
#
compareTwoCountFiles <- function(input.with.multiplemapped.dir, input.without.multiplemapped.dir, input.file.pattern,
output.file.dir)
{
file.1 <- list.files(input.with.multiplemapped.dir, pattern = input.file.pattern,
all.files = TRUE, recursive = TRUE, full.names = TRUE)
file.2 <- list.files(input.without.multiplemapped.dir, pattern = input.file.pattern,
all.files = TRUE, recursive = TRUE, full.names = TRUE)
count.1 <- read.table(file.1,header = FALSE)
count.2 <- read.table(file.2,header = FALSE)
count.1.2 <- merge.data.frame(count.1,count.2,by="V1",sort = FALSE)
count.1.2.dd <- cbind.data.frame(count.1.2,count.1.2$V2.x-count.1.2$V2.y)
head(count.1.2.dd)
}
useKeepMultiMappedData <- function()
{
load("~/Dropbox (BBSR)/Aimin_project/Research/PathwaySplice/data/reCountKeepMultiMapped/jscsKeepMutiMapped.RData")
res1 <- PathwaySplice:::makeFeatureTable(res, use.multigene.aggregates = TRUE)
res2 <- makeGeneTable(res1)
res3 <- PathwaySplice:::splitGeneCluster(res2)
gene.and.its.cluster <- PathwaySplice:::identifyGeneAndItsCluster(res2)
gene.and.its.cluster.2 <- as.data.frame(gene.and.its.cluster)
row.names(gene.and.its.cluster.2) <- gene.and.its.cluster.2$gene
pwf <- getPwf(res3)
pwf.1 <- merge(pwf,gene.and.its.cluster.2,by=0,sort = FALSE)
row.names(pwf.1) <-pwf.1$Row.names
pwf.2 <- pwf.1[,which(colnames(pwf.1) %in%c("DEgenes","bias.data","pwf","geneCluster"))]
}
# test PathwaySplice using all gene table(all.gene.table)
#
dir <- system.file("extdata", package="PathwaySplice")
all.gene.table <- readRDS(file.path(dir, "AllGeneTable.rds"))
res.adj <- runPathwaySplice(all.gene.table,
genome='hg19',
id='ensGene',
test.cats = "GO:BP",
go.size.limit = c(5, 30),
method='Wallenius')
res.unadj <- runPathwaySplice(all.gene.table,
genome='hg19',
id='ensGene',
test.cats = "GO:BP",
go.size.limit = c(5, 30),
method='Hypergeometric')
compareResults(20, res.adj, res.unadj, all.gene.table, type.boxplot='Only3', output.dir = "~/OutputTestPathwaySplice_all_gene_GO_BP_2/")
perGeneQValueExact = function(pGene, theta, geneSplit) {
stopifnot(length(pGene)==length(geneSplit))
## Compute the numerator \sum_{i=1}^M 1-(1-theta)^{n_i}
## Below we first identify the summands which are the same
## (because they have the same n_i), then do the sum via the
## mapply
numExons = listLen(geneSplit)
tab = tabulate(numExons)
notZero = (tab>0)
numerator = mapply(function(m, n) m * (1 - (1-theta)^n),
m = tab[notZero],
n = which(notZero))
numerator = rowSums(numerator)
## Compute the denominator: for each value of theta, the number
## of genes with pGene <= theta[i].
## Note that in cut(..., right=TRUE), the intervals are
## right-closed (left open) intervals.
bins = cut(pGene, breaks=c(-Inf, as.vector(theta)), right = TRUE, include.lowest = TRUE)
counts = tabulate(bins, nbins = nlevels(bins))
denom = cumsum(counts)
stopifnot(denom[length(denom)]==length(pGene))
return(numerator/denom)
}
perGeneQValue <- function (object, p = "pvalue", method = perGeneQValueExact)
{
#stopifnot(is(object, "DEXSeqResults"))
wTest <- which(!is.na(object$padj))
pvals = object[[p]][wTest]
geneID = factor(object[["groupID"]][wTest])
geneSplit = split(seq(along = geneID), geneID)
# identify min feature p value for each gene
pGene = sapply(geneSplit, function(i) min(pvals[i]))
stopifnot(all(is.finite(pGene)))
theta = unique(sort(pGene))
#pGene: smallest p value among features within one gene
#theta: unique and sorted smallest p value
#geneSplit: gene with feature index
q = method(pGene, theta, geneSplit)
res = rep(NA_real_, length(pGene))
res = q[match(pGene, theta)]
# note res > 1, so select the value between 0 and 1
res = pmin(1, res)
names(res) = names(geneSplit)
stopifnot(!any(is.na(res)))
return(res)
}
object=dxr.2
wTest <- which(!is.na(object$padj))
pvals = object[["pvalue"]][wTest]
geneID = factor(object[["groupID"]][wTest])
geneSplit = split(seq(along = geneID), geneID)
perGeneQValueExact(pGene, theta, geneSplit)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.