getFisher <- function(counts) {
# treatment_pre treatment_post control_pre control_post
mat <- matrix(counts, ncol=2, byrow=FALSE)
# Reminders:
#colnames(mat) <- c('treatment', 'control')
#rownames(mat) <- c('PRE', 'POST')
f <- fisher.test(as.table(mat), alternative="two.sided")
return(f$p.value)
}
combineFisherMethod <- function(pvals) {
# Computes the combined pvalue from a list of independent tests resulting pvalues
# following the fisher method.
# http://en.wikipedia.org/wiki/Fisher%27s_method
chisq <- -2*sum(log(pvals))
return(1-pchisq(chisq, df = 2*length(pvals)))
}
checkStep <- function(rds, neededStep) {
if (rds@step > neededStep) { # Already done, not repeating. Give a reset method?
warning("Not repeating a step already done")
return(c(FALSE, rds)) # We hope for promotion.
} else if (rds@step < neededStep) { # Something is missing.
if (neededStep != rds@step) {
if (neededStep == 1) {
warning("Automatically calling countPrePost")
rds <- countPrePost(rds, FALSE)
} else if (neededStep == 2) {
warning("Automatically calling computeRoars")
rds <- computeRoars(rds)
} else if (neededStep == 3) {
warning("Automatically calling computePvals")
rds <- computePvals(rds)
}
}
}
return(c(TRUE, rds))
}
meanAcrossAssays <- function(assays, wantedColumns) {
# Is the conversion to dataframe slow? Is there a more efficient way without
# changing the SE/Assays structure?
wantedCols <- lapply(assays, function(x) { x[,wantedColumns] } )
return(rowMeans(as.data.frame(wantedCols)))
}
getApaGenesFractions <- function(geneGr, apaGr)
{
strand <- unique(as.character(strand(geneGr)))
chr <- unique(as.character(seqnames(geneGr)))
gene_id <- unique(as.character(mcols(geneGr)$gene))
if (length(chr) != 1) {
stop("A gene is on different chrs")
}
if (length(gene_id) != 1) {
stop("A gene has different names")
}
if (strand == '+') {
getApaGenesFractionsPlusStrand(geneGr, apaGr, chr, strand, gene_id)
} else if (strand == '-') {
getApaGenesFractionsMinusStrand(geneGr, apaGr, chr, strand, gene_id)
} else {
stop("A gene has no strand info or both + and -")
}
}
getApaGenesFractionsPlusStrand <- function(geneGr, apaGr, chr, strand, gene_id)
{
geneGr <- sort(geneGr)
apaGr <- sort(apaGr)
seqlengths(seqinfo(geneGr))<-rep(NA, length(seqinfo(geneGr)))
introns <- gaps(geneGr)
# Remove first intron: it is not useful in any way.
# TODO: if there are chr lenghts we will get also a last intron? Yes. Bad fix: remove them.
introns <- tail(introns, n=length(introns)-1) # TODO add check to avoid removing introns if first exon begin == 0/1
fakeApaEndGene <- GRanges(
seqnames = chr,
strand = strand,
ranges = IRanges(
start = end(tail(geneGr, n=1)),
width = 1,
names = ""
)
)
mcols(fakeApaEndGene) <- mcols(apaGr)[1,]
mcols(fakeApaEndGene)[1,"apa"] <- ""
apaGr <- c(apaGr, fakeApaEndGene)
apaFragmentsPrePost <- vector("list", length(apaGr)-1)
apaFrI <- 1
# We start by doing it iteratively and non R/Bioc stylishly because I've got
# no idea how.
# New idea: we combine exons and introns adding them mcols to their kind
# and with info about being overlappant or not.
# Then we run over this GRanges that should have all the needed info
# to generate a GRanges res with intervals and maybe also a list of
# pre/post indexes for every APA.
mcols(geneGr) <- NULL
mcols(geneGr)$type <- 'e'
if (length(introns) != 0) {
mcols(introns)$type <- 'i'
}
whole <- sort(c(geneGr, introns))
hits <- findOverlaps(whole, apaGr)
if (!all(countSubjectHits(hits) == 1)) {
stop("Error: a given apa does not overlap its gene or an apa is not of width 1")
}
foundov <- length(unique(whole[queryHits(hits)]))
mcols(whole)$overlap <- rep(FALSE, length(whole))
mcols(whole[unique(queryHits(hits))])$overlap <- rep(TRUE, foundov)
# Growing objects sin.
begins <- c()
ends <- c()
begin <- -1
lastExonB <- -1
# Write logic and invariant here: begin is always the beginning of the last
# needed fragment (i.e. start of exon with an overlap or with an overlap in
# the next intron or just seen apa). We add an end when
# we get an exon/intron with overlapping apas, and a begin for exons
# with overlap or preceding an overlapping intron.
for (i in 1:length(whole)) {
if (whole[i]$type=='e') {
if (whole[i]$overlap || (i+1<=length(whole) && whole[i+1]$overlap)) {
if (begin != -1) {
begins <- c(begins, begin)
ends <- c(ends, end(whole[i-1]))
}
begin <- start(whole[i])
lastExonB <- length(begins)+1
}
}
if (whole[i]$overlap) {
overlapping_apas <- apaGr[subjectHits(hits[queryHits(hits)==i])]
# Here we have a shot of writing the pre portions of counts
# for each APA.
for (j in 1:length(overlapping_apas)) {
begins <- c(begins, begin)
ends <- c(ends, start(overlapping_apas[j]))
# We add 1 to avoid overlapping PRE/POST fragments
# - APA are considered before the cut.
# In the previous gtf we skipped the "cut" base altogether.
begin <- end(overlapping_apas[j])+1
firstApa_name <- unlist(
strsplit(mcols(overlapping_apas[j])$apa, '_', fixed=TRUE))[1]
apa_name <- firstApa_name
apaFragmentsPrePost[[apaFrI]] <- new("ApaFragmentPrePost",
name=apa_name,
PREstart=lastExonB,
PREend=length(begins))
apaFrI <- apaFrI+1
}
}
}
fragments <- GRanges(seqnames=chr, strand=strand,
ranges=IRanges(start=begins, end=ends))
ovHits <- findOverlaps(fragments, geneGr)
ovLen <- width(overlapsRanges(ranges(fragments), ranges(geneGr), hits=ovHits))
mcols(fragments)$length <- rep(0, length(fragments))
# A single fragment can overlap multiple exons and we need to sum here their lengths:
# i.e. ovlen is longer than length(fragments), with queryHits we get the right reps but
# not sums. The only way I am able to imagine a solution is with a for.
indexes_fr <- queryHits(ovHits)
for (k in 1:length(ovLen)) {
mcols(fragments[indexes_fr[k]])$length <- mcols(fragments[indexes_fr[k]])$length + ovLen[k]
}
# The last apaFragment is build from a single apa (end of gene) only and we don't want it
#fragments <- head(fragments, n=length(fragments)-1)
# But the fragment is ok!
apaFragmentsPrePost <- head(apaFragmentsPrePost,
n=apaFrI-2)
return(list(fragments, apaFragmentsPrePost))
}
# Testing will be hideous.
# Strand -. We are being not DRY but i prefer to start in this way and refactor
# after the first runs/tests. During active development I will have to remember
# to change code (if needed) in two places...but right now I'm not sure
# about how many differences I need to insert in the code.
getApaGenesFractionsMinusStrand <- function(geneGr, apaGr, chr, strand, gene_id)
{
geneGr <- sort(geneGr, decreasing=TRUE)
apaGr <- sort(apaGr, decreasing=TRUE)
introns <- sort(gaps(geneGr), decreasing=TRUE)
seqlengths(seqinfo(geneGr))<-rep(NA, length(seqinfo(geneGr)))
# Remove last intron: it is not useful in any way.
introns <- head(introns, n=length(introns)-1)
fakeApaEndGene <- GRanges(
seqnames = Rle(c(chr)),
strand = strand,
ranges = IRanges(
start= start(tail(geneGr, n=1)),
width= 1,
names = ""
)
)
mcols(fakeApaEndGene) <- mcols(apaGr)[1,]
mcols(fakeApaEndGene)[1,"apa"] <- ""
apaGr <- c(apaGr, fakeApaEndGene)
apaFragmentsPrePost <- vector("list", length(apaGr)-1)
apaFrI <- 1
# We start by doing it iteratively and non R/Bioc stylishly because I've got
# no idea how.
# New idea: we combine exons and introns adding them mcols to their kind
# and with info about being overlappant or not.
# Then we run over this GRanges that should have all the needed info
# to generate a GRanges res with intervals and maybe also a list of
# pre/post indexes for every APA.
mcols(geneGr) <- NULL
mcols(geneGr)$type <- 'e'
if (length(introns) != 0) {
mcols(introns)$type <- 'i'
}
whole <- sort(c(geneGr, introns), decreasing=TRUE)
hits <- findOverlaps(whole, apaGr)
if (!all(countSubjectHits(hits) == 1)) {
stop("Error: a given apa does not overlap its gene or an apa is not of width 1")
}
foundov <- length(unique(whole[queryHits(hits)]))
mcols(whole)$overlap <- rep(FALSE, length(whole))
mcols(whole[unique(queryHits(hits))])$overlap <- rep(TRUE, foundov)
# Growing objects sin.
begins <- c()
ends <- c()
begin <- -1
lastExonB <- -1
# Write logic and invariant here: begin is always the beginning of the last
# needed fragment (i.e. start of exon with an overlap or with an overlap in
# the next intron or just seen apa). We add an end when
# we get an exon/intron with overlapping apas, and a begin for exons
# with overlap or preceding an overlapping intron.
for (i in 1:length(whole)) {
if (whole[i]$type=='e') {
if (whole[i]$overlap || (i+1<=length(whole) && whole[i+1]$overlap)) {
if (begin != -1) {
begins <- c(begins, begin)
ends <- c(ends, start(whole[i-1]))
}
begin <- end(whole[i])
lastExonB <- length(begins)+1
}
}
if (whole[i]$overlap) {
overlapping_apas <- apaGr[subjectHits(hits[queryHits(hits)==i])]
overlapping_apas <- sort(overlapping_apas, decreasing=TRUE)
# Here we have a shot of writing the pre portions of counts
# for each APA.
for (j in 1:length(overlapping_apas)) {
begins <- c(begins, begin)
ends <- c(ends, start(overlapping_apas[j]))
# We add 1 to avoid overlapping PRE/POST fragments
# - APA are considered before the cut.
# In the previous gtf we skipped the "cut" base altogether.
begin <- end(overlapping_apas[j])-1
firstApa_name <- unlist(
strsplit(mcols(overlapping_apas[j])$apa, '_', fixed=TRUE))[1]
apa_name <- firstApa_name
apaFragmentsPrePost[[apaFrI]] <- new("ApaFragmentPrePost",
name=apa_name,
PREstart=lastExonB,
PREend=length(begins))
apaFrI <- apaFrI+1
}
}
}
# here begins and ends are to be switched.
fragments <- GRanges(seqnames=chr, strand=strand,
ranges=IRanges(start=ends, end=begins))
ovHits <- findOverlaps(fragments, geneGr)
ovLen <- width(overlapsRanges(ranges(fragments), ranges(geneGr), hits=ovHits))
mcols(fragments)$length <- rep(0, length(fragments))
# A single fragment can overlap multiple exons and we need to sum here their lengths:
# i.e. ovlen is longer than length(fragments), with queryHits we get the right reps but
# not sums. The only way I am able to imagine a solution is with a for.
indexes_fr <- queryHits(ovHits)
for (k in 1:length(ovLen)) {
mcols(fragments[indexes_fr[k]])$length <- mcols(fragments[indexes_fr[k]])$length + ovLen[k]
}
#mcols(fragments[queryHits(ovHits)])$length <- ovLen # But for length is here XXX FIXME
# The last apaFragment is build from a single apa only and we don't want it
#fragments <- head(fragments, n=length(fragments)-1)
# But the fragment is ok!
apaFragmentsPrePost <- head(apaFragmentsPrePost,
n=apaFrI-2)
return(list(fragments, apaFragmentsPrePost))
}
obtainPrePost <- function(prepost, fragments)
{
sn <- unique(seqnames(fragments))
strand <- unique(strand(fragments))
# We have already checked about strand uniqueness.
gene_id=c(paste0(prepost@name, "_PRE"),
paste0(prepost@name, "_POST"))
# We do not really need the right coords, but still: I was fool.
# We need them to correct in the right way for lengths!
if (strand == "+") {
# This logic does not work for minus strand.
res <- GRanges(seqnames=rep(sn,2), # 22% of the time here?
strand=rep(strand,2),
ranges=IRanges(start=c(start(fragments[prepost@PREstart]),
start(fragments[prepost@PREend+1])),
end=c(end(fragments[prepost@PREend]),
end(tail(fragments, n=1)))))
length <- c(sum(mcols(fragments[seq(prepost@PREstart, prepost@PREend)])$length),
sum(mcols(fragments[seq(prepost@PREend+1, length(fragments))])$length))
} else { # if (strand == "-") { # and also only + and -
res <- GRanges(seqnames=rep(sn,2), # 22% of the time here?
strand=rep(strand,2),
ranges=IRanges(start=c(start(fragments[prepost@PREend]),
start(tail(fragments, n=1))),
end=c(end(fragments[prepost@PREstart]),
end(fragments[prepost@PREend+1]))))
length <- c(sum(mcols(fragments[seq(prepost@PREstart, prepost@PREend)])$length),
sum(mcols(fragments[seq(prepost@PREend+1, length(fragments))])$length))
}
mcols(res) <- DataFrame(gene_id, length)
return(res)
}
sumFragmentCounts<- function(prepost, counts, kind)
{
if (kind == "pre") {
return(sum(counts[c(seq(prepost@PREstart,prepost@PREend, by=1))]))
} else if (kind == "post") {
return(sum(counts[c(seq(prepost@PREend+1,length(counts), by=1))]))
} else {
stop ("Inner error in helper functions to sum counts")
}
}
createRoarSingleBam <- function(name, mulRds, treatmentSE, controlSE)
{
prePostDef <- mulRds@prePostDef[[name]]
fragments <- mulRds@fragments[[name]]
#prePostCoords = "GRanges",
#postCoords = "GRanges",
#countsTreatment = "RangedSummarizedExperiment",
#countsControl = "RangedSummarizedExperiment",
prePostCoords <- do.call(c, sapply(prePostDef, obtainPrePost, fragments))
# We need to extract the right values from treatmentSE/controlSE, should
# clearly be more efficient than looking for the coords in fragments.
# assays(treatmentSE,1)$counts[rownames(assays(treatmentSE,1)$counts)=="10771"]
postElems <- grep("_POST$", mcols(prePostCoords)$gene_id)
preElems <- grep("_PRE$", mcols(prePostCoords)$gene_id)
preCoords <- prePostCoords[preElems,]
rds <- new("RoarDataset",
SummarizedExperiment(rowRanges=preCoords,
colData=DataFrame(row.names=c("treatment_pre","treatment_post","control_pre", "control_post"))),
prePostCoords=prePostCoords)
rds@postCoords <- rds@prePostCoords[postElems,]
se <- SummarizedExperiment(assays = rep(list(matrix(nrow=length(rds@prePostCoords)/2, ncol=4)),2),
rowRanges=preCoords,
colData=DataFrame(row.names=c("treatment_pre","treatment_post","control_pre", "control_post"))
)
# Other than assay rownames we could have used:
#> rownames(assays(pada,1)$counts)
#[1] "100996928" "100996928" "101927572" "101927572" "104909134" "104909134"
#[7] "9" "9" "10" "10" "12" "12"
#> names(rowRanges(pada))
#[1] "100996928" "100996928" "101927572" "101927572" "104909134" "104909134"
#[7] "9" "9" "10" "10" "12" "12"
#treatment <- assays(treatmentSE,1)$counts[rownames(assays(treatmentSE,1)$counts)==name]
# > assays(pada,1)$counts[names(rowRanges(pada))=="12"]
# [1] 0 0
# But which is faster?
control <- assays(controlSE,1,withDimnames=TRUE)$counts[rownames(assays(controlSE,1,withDimnames=TRUE)$counts)==name]
treatment <- assays(treatmentSE,1,withDimnames=TRUE)$counts[rownames(assays(treatmentSE,1,withDimnames=TRUE)$counts)==name]
assay(se,1, withDimnames=TRUE)[,"treatment_pre"] <- sapply(prePostDef, sumFragmentCounts, treatment, "pre")
assay(se,1, withDimnames=TRUE)[,"treatment_post"] <- sapply(prePostDef, sumFragmentCounts, treatment, "post")
assay(se,1, withDimnames=TRUE)[,"control_pre"] <- sapply(prePostDef, sumFragmentCounts, control, "pre")
assay(se,1, withDimnames=TRUE)[,"control_post"] <- sapply(prePostDef, sumFragmentCounts, control, "post")
rowRanges(rds) <- rowRanges(se)
colData(rds) <- colData(se)
assays(rds, withDimnames=TRUE) <- assays(se, withDimnames=TRUE)
names(assays(rds,withDimnames=TRUE)) <- "counts"
rds@step <- 1
# length should be preserved: XXX multipleSamples!
rds@treatmentBams <- list(NA)
rds@controlBams <- list(NA)
rds@cores <- 1
return(rds)
}
createRoarMultipleBam <- function(name, mulRds, treatmentSE, controlSE)
{
prePostDef <- mulRds@prePostDef[[name]]
fragments <- mulRds@fragments[[name]]
#prePostCoords = "GRanges",
#postCoords = "GRanges",
#countsTreatment = "RangedSummarizedExperiment",
#countsControl = "RangedSummarizedExperiment",
prePostCoords <- do.call(c, sapply(prePostDef, obtainPrePost, fragments))
# We need to extract the right values from treatmentSE/controlSE, should
# clearly be more efficient than looking for the coords in fragments.
# assays(treatmentSE,1)$counts[rownames(assays(treatmentSE,1)$counts)=="10771"]
postElems <- grep("_POST$", mcols(prePostCoords)$gene_id)
preElems <- grep("_PRE$", mcols(prePostCoords)$gene_id)
preCoords <- prePostCoords[preElems,]
rds <- new("RoarDataset",
SummarizedExperiment(rowRanges=preCoords,
colData=DataFrame(row.names=c("treatment_pre","treatment_post","control_pre", "control_post"))),
prePostCoords=prePostCoords)
rds@postCoords <- rds@prePostCoords[postElems,]
se <- SummarizedExperiment(assays = rep(list(matrix(nrow=length(rds@prePostCoords)/2, ncol=4)),2),
rowRanges=preCoords,
colData=DataFrame(row.names=c("treatment_pre","treatment_post","control_pre", "control_post"))
)
rds@countsTreatment <- SummarizedExperiment(assays = rep(list(matrix(nrow=length(rds@prePostCoords)/2, ncol=2)), length(mulRds@treatmentBams)),
rowRanges=preCoords,
colData=DataFrame(row.names=c("pre","post"))
)
rds@countsControl <- SummarizedExperiment(assays = rep(list(matrix(nrow=length(rds@prePostCoords)/2, ncol=2)), length(mulRds@controlBams)),
rowRanges=preCoords,
colData=DataFrame(row.names=c("pre", "post"))
)
for (i in 1:length(controlSE)) {
control <- assays(controlSE[[i]],1,withDimnames=TRUE)$counts[rownames(assays(controlSE[[i]],1,withDimnames=TRUE)$counts)==name]
assay(rds@countsControl,i,withDimnames=TRUE)[,"pre"] <- sapply(prePostDef, sumFragmentCounts, control, "pre")
assay(rds@countsControl,i,withDimnames=TRUE)[,"post"] <- sapply(prePostDef, sumFragmentCounts, control, "post")
}
for (i in 1:length(treatmentSE)) {
treatment <- assays(treatmentSE[[i]],1,withDimnames=TRUE)$counts[rownames(assays(treatmentSE[[i]],1,withDimnames=TRUE)$counts)==name]
assay(rds@countsTreatment,i,withDimnames=TRUE)[,"pre"] <- sapply(prePostDef, sumFragmentCounts, treatment, "pre")
assay(rds@countsTreatment,i,withDimnames=TRUE)[,"post"] <- sapply(prePostDef, sumFragmentCounts, treatment, "post")
}
rowRanges(rds) <- rowRanges(se)
colData(rds) <- colData(se)
assays(rds,withDimnames=TRUE) <- assays(se,withDimnames=TRUE)
names(assays(rds,withDimnames=TRUE)) <- "counts"
rds@step <- 1
rds@treatmentBams <- vector(mode = "list", length = length(mulRds@treatmentBams))
rds@controlBams <- vector(mode = "list", length = length(mulRds@controlBams))
rds@cores <- 1
return(rds)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.