R/package.R

Defines functions exonTrack bpTrack binTrack plotTest plotTrack .svdSelf normFactor .diffPatternTestExon2 .diffPatternTest2 diffPatternTestExon diffPatternTest .app2Exact .binning dataBinning .totalTransciptCoordinate .segCal .intersectionStrict .pMappingChr .pMappingAll .offsetFun .adjOff .tempOff .psiteOffset .createAnno .coordGene psiteMapping RiboDiPA

Documented in binTrack bpTrack dataBinning diffPatternTest diffPatternTestExon exonTrack normFactor plotTest plotTrack psiteMapping RiboDiPA

#### wrap function ####
RiboDiPA <- function(bam_file_list, gtf_file, classlabel, psite.mapping="auto",
    exon.binning=FALSE, bin.width=0, zero.omit=FALSE, bin.from.5UTR=TRUE,
    method=c("gtxr", "qvalue"), cores=NULL) {
    if (is.null(cores)) {
        cores <- detectCores(logical=FALSE)
    }
    data.psite <- psiteMapping(bam_file_list=bam_file_list, gtf_file=gtf_file,
        psite.mapping=psite.mapping, cores=cores)
    ifelse(exon.binning, {
        result <- diffPatternTestExon(psitemap=data.psite, 
            classlabel=classlabel, method=method)
        }, {
        data.binned <- dataBinning(data=data.psite$coverage,
            bin.width=bin.width, zero.omit=zero.omit,
            bin.from.5UTR=bin.from.5UTR, cores=cores)
        result <- diffPatternTest(data=data.binned, classlabel=classlabel,
            method=method)
    })
    return(c(result, data.psite))
}

#### p-site mapping on the merged exons ####

psiteMapping <- function(bam_file_list, gtf_file, psite.mapping="auto",
    cores=NULL) {
    options(warn=-1)
    # Get names of samples from BAM file list, removing file paths and suffixes
    names.sample <- sub("(.*\\/)([^.]+)(\\.[[:alnum:]]+$)", "\\2", 
        bam_file_list)
    names.sample <- sub(".bam", "", names.sample)
    # Parse gtf file to create GRangesList object
    txdb <- GenomicFeatures::makeTxDbFromGFF(gtf_file)
    all_exons <- GenomicFeatures::exons(txdb, 
        columns=c("EXONNAME","TXNAME","GENEID"), use.names = TRUE)
    all_genes <- split(all_exons, unlist(values(all_exons)$GENEID))
    # exclude genes on two strands
    all_genes <- all_genes[vapply(runValue(strand(all_genes)), length,
        integer(1)) == 1]
    # exclude genes on multiple chromosomes
    all_genes <- all_genes[vapply(runValue(seqnames(all_genes)), length,
        integer(1)) == 1]
    message("Computing total transcript coordinate for exons ...")
    exons.coord <- lapply(all_genes, .totalTransciptCoordinate)
    message("done\n")
    # merge overlapping exons in total transcript
    all_genes <- IRanges::reduce(all_genes)
    # p-site mapping
    results_all <- .pMappingAll(bam_file_list=bam_file_list,
        psite.mapping=psite.mapping, txdb=txdb, all_genes=all_genes,
        cores=cores)
    colnames(results_all$counts) <- names.sample
    # assign genomic coordinates as names of sites
    results_all$coverage <- results_all$coverage[names(exons.coord)] ###########
    tmp <- mapply(.coordGene, results_all$coverage, exons.coord)
    results_all$coverage <- tmp
    results_all$exons <- exons.coord
    return(results_all)
}

.coordGene <- function(coverage.gene,coord.gene){
    strands <- coord.gene[,"strand"][1]
    if(strands == "+"){
        coord.tmp <- IRanges(coord.gene[,"start_genome"],
            coord.gene[,"end_genome"])
        coord.tmp <- BiocGenerics::union(coord.tmp, coord.tmp)
        result <- NULL
        for(i in seq_len(length(coord.tmp))){
            result <- c(result,seq(from=start(coord.tmp)[i], 
                to=end(coord.tmp)[i]))
        }
        colnames(coverage.gene) <- result
    }else{
        coord.tmp <- IRanges(coord.gene[,"end_genome"],
            coord.gene[,"start_genome"])
        coord.tmp <- BiocGenerics::union(coord.tmp, coord.tmp)
        result <- NULL
        for(i in seq_len(length(coord.tmp))){
            result <- c(result,seq(from=start(coord.tmp)[i], 
                to=end(coord.tmp)[i]))
        }
        colnames(coverage.gene) <- rev(result)
    }
    return(coverage.gene)
}

.createAnno <- function(txdb) {
    group_name <- NULL
    exon <- suppressWarnings(GenomicFeatures::exonsBy(txdb,
        by="tx", use.names=TRUE))
    utr5 <- suppressWarnings(GenomicFeatures::fiveUTRsByTranscript(txdb,
        use.names=TRUE))
    cds <- suppressWarnings(GenomicFeatures::cdsBy(txdb, by="tx",
        use.names=TRUE))
    utr3 <- suppressWarnings(GenomicFeatures::threeUTRsByTranscript(txdb,
        use.names=TRUE))

    exon <- data.table::as.data.table(exon[unique(names(exon))])
    utr5 <- data.table::as.data.table(utr5[unique(names(utr5))])
    cds <- data.table::as.data.table(cds[unique(names(cds))])
    utr3 <- data.table::as.data.table(utr3[unique(names(utr3))])

    anno_df <- exon[, list(l_tr=sum(width)), by=list(transcript=group_name)]
    l_utr5 <- utr5[, list(l_utr5=sum(width)), by=list(transcript=group_name)]
    l_cds <- cds[, list(l_cds=sum(width)), by=list(transcript=group_name)]
    l_utr3 <- utr3[, list(l_utr3=sum(width)), by=list(transcript=group_name)]

    merge_allx <- function(x, y) merge(x, y, all.x=TRUE)
    anno_df <- Reduce(merge_allx, list(anno_df, l_utr5, l_cds, l_utr3))
    # standardGeneric for 'Reduce' defined from package 'BiocGenerics'
    anno_df[is.na(anno_df)] <- 0

    return(anno_df)
}

.psiteOffset <- function(bam_file_list, txdb) {
    cds_start <- cds_stop <- end3 <- end5 <- i.l_cds <- offset <- NULL
    i.l_utr5 <-  site_dist_end3 <- site_dist_end5 <- transcript <- NULL
    # prepare annotation
    annotation <- .createAnno(txdb)
    all_tx <- GenomicFeatures::exonsBy(txdb, by="tx", use.names=TRUE)
    name.tx <- names(unlist(all_tx))
    all_tmp <- data.table::as.data.table(all_tx)
    all_tmp[, `:=`(start, start - 30)][, `:=`(end, end + 30)][, `:=`(width,
        width + 60)]
    all_tmp <- GRanges(all_tmp)
    names(all_tmp) <- name.tx

    site_sub <- NULL
    for (bam in bam_file_list) {
        data <- GenomicAlignments::readGAlignments(bam)
        qwidth.data <- qwidth(data)
        data <- GenomicRanges::granges(data)
        data <- as.data.table(GenomicFeatures::mapToTranscripts(data, all_tmp))
        data[, `:=`(start, start - 30)][, `:=`(end, end - 30)]
        data$width <- qwidth.data[data$xHits]

        data <- data[, c("seqnames", "start", "end", "width"), with=FALSE]
        setnames(data, c("transcript", "end5", "end3", "length"))

        nreads <- nrow(data)
        data <- data[as.character(transcript) %in%
            as.character(annotation$transcript)]
        data[annotation, on="transcript", `:=`(c("cds_start", "cds_stop"),
            list(i.l_utr5 + 1, i.l_utr5 + i.l_cds))]
        data[cds_start == 1 & cds_stop == 0, `:=`(cds_start, 0)]

        lev <- sort(unique(data$length))
        data[, `:=`(site_dist_end5, end5 - cds_start)]
        data[, `:=`(site_dist_end3, end3 - cds_start)]
        site_sub <- rbind(site_sub, data[site_dist_end5 <= -6 &
            site_dist_end3 >= 5])
    }
    site_sub <- site_sub[length == end3 - end5 + 1]
    offsets <- .offsetFun(site_sub=site_sub, lev=lev)
    return(offsets)
}

.tempOff <- function(v_dist) {
    ttable <- sort(table(v_dist), decreasing=TRUE)
    ttable_sr <- ttable[as.character(as.numeric(names(ttable)) + 1)]
    ttable_sl <- ttable[as.character(as.numeric(names(ttable)) - 1)]
    tsel <- rowSums(cbind(ttable > ttable_sr, ttable > ttable_sl), na.rm=TRUE)
    return(as.numeric(names(tsel[tsel == 2][1])))
}

.adjOff <- function(dtsite, bestoff) {
    t <- table(factor(dtsite, levels=seq(min(dtsite) - 2, max(dtsite) + 1)))
    t[c(1, 2)] <- t[3] + 1
    locmax <- as.numeric(as.character(names(t[which(diff(sign(diff(t))) ==
        -2)]))) + 1
    adjoff <- locmax[which.min(abs(locmax - bestoff))]
    ifelse(length(adjoff) != 0, adjoff, bestoff)
}

.offsetFun <- function(site_sub, lev) {
    onsite <- count <- .SD <- site_dist_end5 <- NULL
    minlen <- min(site_sub$length)
    maxlen <- max(site_sub$length)
    t <- table(factor(site_sub$length, levels=lev))
    # offset
    offsets <- data.table(length=as.numeric(as.character(names(t))),
        count=as.vector(t))
    offsets[, `:=`(onsite, TRUE)][count == 0, `:=`(onsite, FALSE)]
    offset5 <- site_sub[, list(offset=.tempOff(.SD$site_dist_end5)), by=length]
    offsets.tmp <- merge(offsets, offset5, all.x=TRUE, by="length")
    best.offset <- as.numeric(offsets.tmp[!is.na(offset),
        list(count=sum(count)), by=offset][count == max(count)][, offset])
    # adjusted offset
    adj.tab <- site_sub[, list(offset=.adjOff(site_dist_end5, best.offset)),
        by=length]
    offsets <- merge(offsets, adj.tab, all.x=TRUE, by="length")
    offsets[is.na(offset), `:=`(offset, best.offset)][,
        `:=`(offset, abs(offset))][, `:=`(count, NULL)][, `:=`(onsite, NULL)]
    setnames(offsets, c("qwidth", "psite"))
}

.pMappingAll <- function(bam_file_list, psite.mapping, txdb, all_genes, cores){
    chrom <- seqnames(seqinfo(all_genes))  # extract all chromosome names
    for (k in seq_along(bam_file_list)) {
        if (!file.exists(gsub(".bam", ".bai", bam_file_list[k]))) {
            message("Indexing", bam_file_list[k], "\n")
            Rsamtools::indexBam(bam_file_list[k])
        }
    }
    if (psite.mapping[1] == "auto") {
        message("Computing P-site offsets \n")
        psite.mapping <- .psiteOffset(bam_file_list=bam_file_list, txdb=txdb)
    }
    if (is.null(cores)) {
        cores <- detectCores(logical=FALSE)
    }
    for (k in seq_along(bam_file_list)) {
        message("Computing coverage for", bam_file_list[k], "...")
        cl <- parallel::makeCluster(cores - 1)
        if (cores > 1) {
            registerDoParallel(cl)
        }
        i <- NULL
        results <- foreach(i=seq_along(chrom), .export=c(".pMappingChr",
        ".intersectionStrict", "psiteCal", ".createAnno", ".psiteOffset"),
        .packages=c("Rsamtools", "GenomicAlignments", "GenomicFeatures",
        "data.table", "Rcpp"), .multicombine=TRUE) %dopar%
            {
                return(.pMappingChr(bam=bam_file_list[k], all_genes=all_genes,
                    ch=chrom[i], psite.mapping=psite.mapping))
            }
        stopImplicitCluster()
        stopCluster(cl)
        read_counts <- do.call(rbind, lapply(results, `[[`, 2)) # separate read
        coverage_RLE <- lapply(results, `[[`, 1)  # separate coverage
        coverage_RLE <- coverage_RLE[lengths(coverage_RLE) > 0] # Rle to vector
        coverage_RLE <- do.call("c", coverage_RLE)
        coverage_vector <- lapply(coverage_RLE, as.vector)
        ifelse(k == 1, {
            coverage_matrix <- coverage_vector
            read_counts_matrix <- read_counts
        }, {
            coverage_matrix <- mapply(rbind, coverage_matrix, coverage_vector)
            read_counts_matrix <- cbind(read_counts_matrix, read_counts)
        })
        message("done! \n")
    }
    return(list(coverage=coverage_matrix, counts=read_counts_matrix,
        psite.mapping=psite.mapping))
}

.pMappingChr <- function(bam, all_genes, ch, psite.mapping) {
    ## function to calculate the coverage score and return in Rle objects
    psite <- center <- NULL
    bf <- BamFile(bam)
    tx_compat_cvg <- RleList()
    read_counts <- data.frame()
    genes <- all_genes[vapply(runValue(seqnames(all_genes)),
        function(x) as.character(unique(x)), character(1)) == ch]
    if (!length(genes)) return(list(tx_compat_cvg, read_counts))
    gr <- as(seqinfo(bf), "GRanges")
    if (ch %in% runValue(seqnames(gr))) {
        param <- ScanBamParam(which=gr[ch],
            flag=scanBamFlag(isSecondaryAlignment=FALSE))
        gal2 <- readGAlignments(bf, param=param)
        gal2 <- gal2[strand(gal2) != "*"]
        gal2 <- gal2[is.na(seqnames(gal2)) == FALSE]
        if (length(gal2)) {
            gal3 <- as.data.table(gal2)
            ifelse(all(psite.mapping[1] == "center"),
                gal3[, `:=`(psite, floor(qwidth/2))],
                gal3 <- merge.data.table(gal3, psite.mapping, by="qwidth"))
            gal3[strand == "-", `:=`(psite, qwidth - psite - 1)][, `:=`(center,
                psiteCal(cigar, start, psite))]
            gal3 <- gal3[center > 0, ]
            gal3[, `:=`(qwidth, 1)][, `:=`(width, 1)][, `:=`(njunc, 0)][,
                `:=`(cigar, "1M")]
            gal3[, `:=`(start, center)][, `:=`(end, center)][,
                `:=`(psite, NULL)][, `:=`(center, NULL)]
            gal3 <- makeGRangesFromDataFrame(gal3)
            gal3 <- as(gal3, "GAlignments")
        } else {
            gal3 <- gal2
        }
        results <- .intersectionStrict(genes, gal3)
        gal3 <- gal3[queryHits(results)]
        results <- .intersectionStrict(genes, gal3)
        tx2reads <- setNames(as(t(results), "List"), names(genes))
        compat_reads_by_tx <- extractList(gal3, tx2reads)
        read_counts <- data.frame(lengths(tx2reads))
    } else {
        tx2reads1 <- IntegerList(vector("list", length(genes)))
        names(tx2reads1) <- names(genes)
        compat_reads_by_tx <- extractList(gr, tx2reads1)
        read_counts <- data.frame(lengths(tx2reads1))
    }
    tx_compat_cvg <- pcoverageByTranscript(compat_reads_by_tx, genes,
        ignore.strand=FALSE)
    return(list(tx_compat_cvg, read_counts))
}

.intersectionStrict <- function(features, reads, ignore.strand=FALSE,
    inter.feature=TRUE){
    ## function adapted to compile compatible reads
    ov <- findOverlaps(reads, features, type="within",
        ignore.strand=ignore.strand)
    if (inter.feature) {
        reads_to_keep <- which(countQueryHits(ov) == 1L)
        ov <- ov[queryHits(ov) %in% reads_to_keep]
    }
    return(ov)
}

.segCal <- function(x, ints) {
    l <- sum(x >= ints[, 1])
    cumints <- c(0, cumsum(ints[, 2]))
    return(cumints[l] + x - ints[l, 1] + 1)
}

.totalTransciptCoordinate <- function(x) {
    range.gene <- cbind(start(x), end(x))
    strands <- as.character(strand(x))[1]
    starts <- start(x)
    ends <- end(x)
    if(strands=="+"){
        range.gene2 <- IRanges(starts, ends)
        range.gene2 <- as.matrix(BiocGenerics::union(range.gene2, range.gene2))
        range.relative <- matrix(vapply(range.gene,
            function(x) .segCal(x, range.gene2), numeric(1)), ncol=2)
        range.relative <- cbind(range.relative, range.gene)
        rownames(range.relative) <- unlist(x$EXONNAME) 
    }else{
        m <- max(start(x),end(x))+1
        range.gene1 <- cbind(rev(m-ends),rev(m-starts))
        range.gene2 <- IRanges(range.gene1[,1],range.gene1[,2])
        range.gene2 <- as.matrix(BiocGenerics::union(range.gene2, range.gene2))
        range.relative <- matrix(vapply(range.gene1,
            function(x) .segCal(x, range.gene2), numeric(1)), ncol=2)
        range.relative <- cbind(range.relative,
            rev(range.gene[,2]), rev(range.gene[,1]))
        rownames(range.relative) <- rev(unlist(x$EXONNAME))
    }
    colnames(range.relative) <- c("start_tranx", "end_tranx","start_genome",
        "end_genome")
    range.relative <- data.frame(range.relative) ###########
    range.relative$seqnames <- as.character(seqnames(x))
    range.relative$strand <- strands
    range.relative$txname <- paste(sapply(x$TXNAME, paste, collapse=", ")) ###########
    return(range.relative)
}

#### data binning ####

dataBinning <- function(data, bin.width=0, zero.omit=FALSE, bin.from.5UTR=TRUE,
    cores=NULL) {
    options(warn=-1)
    if (is.null(cores)) cores <- detectCores(logical=FALSE)
    cl <- makeCluster(cores - 1)
    if (cores > 1) registerDoParallel(cl)
    genes.list <- names(data)
    gene <- NULL
    message("Data binning ...")
    DATA.BIN <- foreach(gene=genes.list, .final=function(x) setNames(x,
        genes.list), .packages="reldist", 
        .export=c(".app2Exact",".binning"))  %dopar% {
            data1 <- data[[gene]]
            .binning(data1, bin.width, zero.omit, bin.from.5UTR)
    }
    stopImplicitCluster()
    stopCluster(cl)
    message("done! \n")
    return(DATA.BIN)
}

.binning <- function(data1, bin.width, zero.omit, bin.from.5UTR){
    n <- nrow(data1)
    p <- ncol(data1)
    p.codon <- ceiling(p/3)  # merge every 3 nt into a codon
    ifelse(bin.from.5UTR, bin.codon <- (seq_len(p) - 1)%/%3,
        bin.codon <- rev((rev(seq_len(p)) - 1)%/%3))
    data.codon <- t(apply(data1, 1, function(x) 
        unname(tapply(x, bin.codon, sum))))
    names.codon <- unname(tapply(colnames(data1), bin.codon, function(x)
        paste(x[1],tail(x,n=1), sep="-")))
    colnames(data.codon) <- names.codon
    if (bin.width == 1) return(data.codon) # return codon level
    if (sum(data1)) {
        if (bin.width) {
            p.bin <- ceiling(p.codon/bin.width)  # fixed width
        } else {
            n.data <- mean(rowSums(data.codon))
            iqr.data <- abs(wtd.iqr(x=seq_len(p.codon),
                weight=colMeans(data.codon)))
            n.bin <- ceiling(p.codon/(2 * iqr.data/(n.data^(1/3))))
            p.bin <- min(p.codon, n.bin)  # adaptive width
        }
        bin.size <- .app2Exact(p.codon, rep(1/p.bin, p.bin))
        if (!bin.from.5UTR) bin.size <- rev(bin.size)
        Bin.size <- c(0, cumsum(bin.size))
        data.bin <- do.call("cbind",lapply(split(seq_len(p.codon),
            cut(seq_len(p.codon), Bin.size)), function(x) {
            rowSums(data.codon[, x, drop=FALSE])}))
        bin.size2 <- do.call(c,lapply(split(unname(table((
            seq_len(p) - 1)%/%3)),cut(seq_len(p.codon), Bin.size)),sum))
        Bin.size2 <- c(0, cumsum(unname(bin.size2)))
        names.bin <- lapply(split(seq_len(p),cut(seq_len(p), Bin.size2)),
            function(x) paste(colnames(data1)[x[1]],
            colnames(data1)[tail(x,n=1)], sep="-"))
        colnames(data.bin) <- unname(names.bin)
    } else {
        data.bin <- matrix(0, nrow=n, ncol=1) # no reads
        colnames(data.bin) <- paste(colnames(data1)[1],
            tail(colnames(data1),n=1), sep="-")
    }
    if (zero.omit) data.bin <- data.bin[,colSums(data.bin) > 0, drop=FALSE]
    if (is.vector(data.bin)){
        data.bin <- matrix(data.bin, ncol=1) 
        colnames(data.bin) <- paste(colnames(data1)[1],
            tail(colnames(data1),n=1), sep="-")
    } 
    return(data.bin)
}
    
.app2Exact <- function(n, p) {
    ## p=p[1:m] is an allocation, p[i] >=0, sum(p)=1
    ans <- floor(n * p)
    k <- n - sum(ans)
    if (k > 0) {
        stemp <- n * p - ans
        otemp <- order(-stemp)[seq_len(k)]
        ans[otemp] <- ans[otemp] + 1
    }
    return(ans)
}

#### main function ####
diffPatternTest <- function(data, classlabel, method=c("gtxr", "qvalue")) {
    options(warn=-1)

    ## prepare data
    genes.list <- names(data)
    ## remove other replicates not involved in test
    data <- lapply(data, "[", which(classlabel$comparison != 0), )
    ## remove genes with only a vector
    noread <- which(do.call("c", lapply(data, is.vector)))
    genes.list <- genes.list[which(do.call("c", lapply(data, is.matrix)))]
    data <- data[genes.list]
    genes.list <- genes.list[which(apply(do.call("rbind",
        lapply(data, rowSums)), 1, min) > 0)]
    ## remove genes with no read
    noread2 <- which(apply(do.call("rbind", lapply(data, rowSums)),1,min) == 0)
    data <- data[genes.list]

    result.dp <- .diffPatternTest2(data=data, classlabel=classlabel[classlabel$
        comparison != 0, ], method=method)
    result.dp$small <- names(c(noread, noread2))
    result.dp$data <- data
    result.dp$classlabel <- classlabel
    return(result.dp)
}

diffPatternTestExon <- function(psitemap, classlabel, method=c("gtxr", 
    "qvalue")) {
    options(warn=-1)

    ## prepare data
    data <- psitemap$coverage
    genes.list <- names(data)
    ## remove other replicates not involved in test
    data <- lapply(data, "[", which(classlabel$comparison != 0), )
    genes.list <- genes.list[which(apply(do.call("rbind",
        lapply(data, rowSums)), 1, min) > 0)]
    ## remove genes with no read
    noread <- which(apply(do.call("rbind", lapply(data, rowSums)),1,min) == 0)
    data <- data[genes.list]
    sgtf <- psitemap$exons[genes.list]

    result.dp <- .diffPatternTestExon2(sgtf=sgtf, data=data, classlabel=
        classlabel[classlabel$comparison != 0, ], method=method)
    result.dp$small <- names(noread)
    result.dp$data <- data
    result.dp$classlabel <- classlabel
    tmp <- mapply(base::cbind,result.dp$bin,sgtf, SIMPLIFY = FALSE)
    result.dp$bin <- tmp
    return(result.dp)
}

.diffPatternTest2 <- function(data, classlabel, method) {
    genes.list <- names(data)
    condition <- classlabel$comparison

    tvalue <- 1 - do.call("c", lapply(data, function(x) {
        abs(sum(.svdSelf(x[which(condition == 1), ]) *
            .svdSelf(x[which(condition == 2), ])))
    }))

    message("Bin level testing ...\n")
    DATA.com <- t(do.call("cbind", data))  # link all bins
    classlabel$comparison <- as.factor(classlabel$comparison)
    colnames(DATA.com) <- classlabel$type
    dds <- DESeqDataSetFromMatrix(countData=DATA.com, colData=classlabel,
        design=~comparison)
    S.hat <- lapply(data, normFactor, condition=condition)  # size factor
    normFactors <- t(do.call("cbind", mapply(function(x, y)
        replicate(ncol(y), x), S.hat, data)))
    normalizationFactors(dds) <- normFactors  # assign size factors
    dds <- DESeq(dds)
    res <- as.matrix(results(dds))[, c("pvalue", "log2FoldChange")]

    message("done!\nSplitting test results by genes ...")
    LL <- unlist(lapply(data, ncol))
    res.bin <- apply(res, 2, function(x) split(x, rep(names(LL), LL)))
    res.bin <- mapply(cbind, res.bin$pvalue, res.bin$log2FoldChange)
    res.bin <- lapply(res.bin, function(x) {
        x <- cbind(x, padj=NA)
        colnames(x) <- c("pvalue", "log2FoldChange", method[1])
        x[which(!is.na(x[, "pvalue"])), method[1]] <-
            elitism::p.adjust(na.omit(x[, "pvalue"]), method=method[1])
        return(x)
    })
    pvalue.gene <- do.call("c", lapply(res.bin, function(x) min(x[, method[1]],
        na.rm=TRUE)))
    pvalue.gene <- sort(pvalue.gene) ###########
    genes.list <- names(pvalue.gene) ###########
    message("done!\nGenome wide family control ...")
    ifelse(method[2] == "qvalue", padj.gene <- qvalue(pvalue.gene)$qvalues,
        padj.gene <- elitism::p.adjust(pvalue.gene, method=method[2]))
    gene.result <- data.frame(tvalue=tvalue[genes.list],
        pvalue=pvalue.gene[genes.list], padj=padj.gene[genes.list])
    colnames(gene.result)[3] <- method[2]
    message("done! \n")
    return(list(bin=res.bin[genes.list], gene=gene.result, method=method))
}

.diffPatternTestExon2 <- function(sgtf, data, classlabel, method) {
    genes.list <- names(data)
    condition <- classlabel$comparison
    message("Exon binning ...\n")
    data.exon <- mapply(function(x, y) {
    apply(y, 1, function(z) rowSums2(x, cols=seq(z[1], z[2])))}, data, sgtf)
    tvalue <- 1 - do.call("c", lapply(data.exon, function(x) {
        ifelse(ncol(x) == 1, 1, abs(sum(.svdSelf(x[which(condition == 1), ]) *
            .svdSelf(x[which(condition == 2), ]))))
    }))
    tvalue <- pmax(0, tvalue)
    message("done!\nExon level testing ...\n")
    factor.exon <- lapply(data, function(x) {
        x.codon <- t(apply(x, 1, function(v) {
            unname(tapply(v, (seq_along(v) - 1)%/%3, sum))}))
        normFactor(x.codon, condition)
    })
    normFactors <- t(do.call("cbind", mapply(function(x, y)
        replicate(ncol(y), x), factor.exon, data.exon)))
    DATA.com <- t(do.call("cbind", data.exon))
    classlabel <- classlabel[classlabel$comparison != 0, ]
    classlabel$comparison <- as.factor(classlabel$comparison)
    colnames(DATA.com) <- classlabel$type
    dds <- DESeqDataSetFromMatrix(countData=DATA.com, colData=classlabel,
        design=~comparison)
    normalizationFactors(dds) <- normFactors  # assign size factors
    dds <- DESeq(dds)
    res <- as.matrix(results(dds))[, c("pvalue", "log2FoldChange")]
    message("Exon level testing done!\nSplitting test results by genes ...")
    LL <- unlist(lapply(data.exon, ncol))
    res.bin <- apply(res, 2, function(x) split(x, rep(names(LL), LL)))
    res.bin <- mapply(cbind, res.bin$pvalue, res.bin$log2FoldChange)
    res.bin <- lapply(res.bin, function(x) {
        x <- cbind(x, padj=NA)
        colnames(x) <- c("pvalue", "log2FoldChange", method[1])
        x[which(!is.na(x[, "pvalue"])), method[1]] <-
            elitism::p.adjust(na.omit(x[, "pvalue"]), method=method[1])
        return(x)
    })
    pvalue.gene <- do.call("c", lapply(res.bin, function(x)
        min(x[, method[1]], na.rm=TRUE)))
    pvalue.gene <- sort(pvalue.gene)
    genes.list <- names(pvalue.gene)
    ifelse(method[2] == "qvalue", padj.gene <- qvalue(pvalue.gene)$qvalues,
        padj.gene <- elitism::p.adjust(pvalue.gene,method=method[2]))
    gene.result <- data.frame(tvalue=tvalue[genes.list],
        pvalue=pvalue.gene[genes.list],padj=padj.gene[genes.list])
    colnames(gene.result)[3] <- method[2]
    message("done! \n")
    return(list(bin=res.bin[genes.list], gene=gene.result, method=method,
        exon=data.exon))
}

## abundance estimation function
normFactor <- function(x, condition) {
    s.tmp <- rowSums(x)/median(rowSums(x))
    if (all(s.tmp > 0)) {
        y1 <- colSums2(x, rows=(condition == 1))
        y2 <- colSums2(x, rows=(condition == 2))
        ind1 <- (y1 > 0) & (y2 > 0)
        if (sum(ind1) > 1) {
            x <- x[, ind1]
            y1 <- colSums2(x, rows=(condition == 1))
            y2 <- colSums2(x, rows=(condition == 2))
            x.total <- sum(x)
            tmp <- log(y1) - log(y2)
            tmp2 <- as.numeric(quantile(tmp, probs=c(0.25, 0.75), na.rm=TRUE) +
                c(-1.5, 1.5) * IQR(tmp, na.rm=TRUE))
            ind2 <- tmp >= tmp2[1] & tmp <= tmp2[2]
            if (sum(ind2) > 1) {
                x <- x[, ind2]
                if (sum(x)/x.total > 0.1 & all(rowSums(x) > 0)) {
                    s.tmp <- rowSums(x)/median(rowSums(x))
                }
            }
        }
    }
    s.tmp[s.tmp == 0] <- 1e-08
    s.tmp
}

.svdSelf <- function(x) {
    ifelse(is.vector(x), result <- x/sqrt(sum(x^2)), result <- svd(x)$v[, 1])
    return(result)
}


#### plotting ####

plotTrack <- function(data, genes.list, replicates=NULL, exons=FALSE) {
    nt <- RPF <- NULL
    options(warn=-1)
    if (is.null(replicates)) {
        replicates <- seq_len(ncol(data$counts))
    }
    gplot <- NULL
    for (gene in genes.list) {
        gplot[[gene]] <- NULL
        data0 <- data$coverage[[gene]]
        nn <- nrow(data$exons[[gene]])
        x.tick <- c(data$exons[[gene]][,1],data$exons[[gene]][nn,2])
        x.label <- c(data$exons[[gene]][,3],data$exons[[gene]][nn,4])
        track0 <- NULL
        for (i in replicates) {
            rep.i <- data.frame(nt=seq_len(ncol(data0)), RPF=data0[i, ],
                                replicate=colnames(data$counts)[i])
            track0 <- rbind(track0, rep.i)
        }
        gplot[[gene]][["nt"]] <- ggplot(track0, aes(x=nt, y=RPF)) +
            geom_bar(stat="identity") + theme_minimal() +
            facet_wrap(~replicate, ncol=1) +
            scale_x_continuous(breaks=x.tick, labels =x.label) +
            geom_vline(xintercept=x.tick, alpha=0.1)+
            ggtitle(paste("gene: ", gene, ", ", ncol(data0), " nt", sep="")) +
            theme(plot.title=element_text(hjust=0.5), 
                    axis.text.x = element_text(angle = 45, vjust = 1, 
                                            size = 8, hjust = 1))
        if (exons) {
            gplot[[gene]][["exon"]] <- NULL
            exons.gene <- data$exons[[gene]]
            for (j in seq_len(nrow(exons.gene))) {
                exon <- rownames(exons.gene)[j]
                track2 <- track0[track0$nt >= exons.gene[j, 1] & track0$nt <=
                                    exons.gene[j, 2], ]
                gplot[[gene]][["exon"]][[exon]] <- ggplot(track2,
                            aes(x=nt,y=RPF))+ geom_bar(stat="identity") +
                theme_minimal() + facet_wrap(~replicate,ncol=1) +
                scale_x_continuous(breaks = unname(unlist(exons.gene[j, 1:2])),
                            labels = unname(unlist(exons.gene[j, 3:4]))) +
                ggtitle(paste("gene: ", gene, ", exon: ", exon, ", ",
                        diff(unlist(exons.gene[j,1:2])) + 1, " nt", sep="")) +
                    theme(plot.title=element_text(hjust=0.5))
            }
        }
    }
    return(gplot)
}

plotTest <- function(result, genes.list=NULL, threshold=0.05) {
    bin <- RPF <- condition <- NULL
    method <- result$method
    classlabel <- result$classlabel[result$classlabel$comparison != 0, ]
    if (is.null(genes.list)) {
        genes.list <- rownames(result$gene[which(result$gene[, method[2]] <=
                                                threshold),])
    }
    gplot <- NULL
    for (gene in genes.list) {
        data1 <- result$data[[gene]]
        tmp <- result$bin[[gene]][, method[1]]
        diff.spot <- which(tmp <= threshold)
        track1 <- data.frame()
        for (i in seq_len(nrow(classlabel))) {
            rep.i <- data.frame(bin=seq_len(ncol(data1)), RPF=data1[i, ],
                                replicate=rownames(classlabel)[i],
                                condition=classlabel$condition[i])
            rep.i$condition <- as.character(rep.i$condition)
            rep.i[diff.spot, "condition"] <- "DP spot"
            track1 <- rbind(track1, rep.i)
        }
        rownames(track1) <- NULL
        # name.track <- rownames(result$bin[[gene]])
        track1$replicate <- as.factor(track1$replicate)
        track1$replicate <- factor(track1$replicate,
                    levels(track1$replicate)[order(classlabel$comparison)])
        gplot[[gene]] <- ggplot(track1, aes(x=bin, y=RPF, fill=condition)) +
            geom_bar(stat="identity") + theme_minimal() +facet_wrap(~replicate,
                            ncol=1) + scale_x_continuous(breaks = diff.spot,
                labels = formatC(tmp[diff.spot], format = "e", digits = 2)) +
            ggtitle(paste(gene,", ", method[2], " =", formatC(result$gene[gene,
                method[2]], format="e", digits=2), ", T-value =",
                formatC(result$gene[gene, "tvalue"], format="e", digits=2),
                        ", ", ncol(data1), " bins", sep="")) +
            theme(plot.title=element_text(hjust=0.5), 
                    axis.text.x = element_text(size = 8, 
                                        angle = 90, vjust = 0.5, hjust=1))
    }
    return(gplot)
}

#### track output for genome browser ####
binTrack <- function(data, exon.anno){
    classlabel <- data$classlabel
    track.bin <- vector(mode="list", length=nrow(classlabel))
    names(track.bin) <- rownames(classlabel)
    for(n.rep in seq_len(nrow(classlabel))){
        track <- rownames(classlabel)[n.rep]
        result.track <- GRanges(seqnames = NULL, ranges = NULL,
            score = NULL, genes = NULL, tracks = NULL, bin = NULL)
        for(gene in names(data$data)){
            data.bin <- data$data[[gene]]
            bin.range <- colnames(data.bin)
            range.bin <- do.call(rbind, do.call(c, lapply(bin.range,
                strsplit, split = '-')))
            chr <- exon.anno[[gene]]$seqnames[1]
            strand <- exon.anno[[gene]]$strand[1]
            ifelse(strand=="+", {
                starts <- as.integer(range.bin[,1])
                ends <- as.integer(range.bin[,2])
                exon.tmp <- data.table(start = exon.anno[[gene]][,3],
                    end = exon.anno[[gene]][,4])
            },{
                ends <- as.integer(range.bin[,1])
                starts <- as.integer(range.bin[,2])
                exon.tmp <- data.table(start = exon.anno[[gene]][,4],
                    end = exon.anno[[gene]][,3])
            })
            track.tmp <- data.table(start = starts, end = ends,
                score = data$data[[gene]][n.rep,],
                bin = seq_len(nrow(range.bin)),
                padj = data$bin[[gene]][,3])
            setkey(exon.tmp, start, end)
            overlap.bin <- as.matrix(data.table::foverlaps(track.tmp,
                exon.tmp, nomatch = 0L))
            
            track.gene <- GRanges(seqnames = chr, 
                ranges=IRanges(start = overlap.bin[,3] - 1, 
                end = overlap.bin[,4]), strand = strand,
                score = overlap.bin[,5], genes = gene, tracks = track, 
                bin = as.character(overlap.bin[,6]), padj = overlap.bin[,7])
            exon.int <- GRanges(seqnames = chr, ranges = 
                IRanges(start = overlap.bin[,1] - 1, end = overlap.bin[,2]))
            track.gene <- pintersect(track.gene, exon.int)[,1:5]
            result.track <- c(result.track, track.gene)
        }
        track.bin[[track]] <- result.track
    }
    return(track.bin)
}

bpTrack <- function(data, names.rep = NULL, genes.list = NULL){
    if(is.null(names.rep)){
        names.rep <- colnames(data$counts)
    }
    if(is.null(genes.list)){
        genes.list <- names(data$coverage)
    }
    track.bp <- vector(mode = "list", length = length(names.rep))
    names(track.bp) <- names.rep
    for(n.rep in seq_len(length(names.rep))){
        track <- names.rep[n.rep]
        result.track <- GRanges(seqnames = NULL, ranges = NULL,
            coverahe = NULL, genes = NULL)
        for(gene in genes.list){
            chr <- data$exons[[gene]]$seqnames[1]
            strand <- data$exons[[gene]]$strand[1]
            sites <- as.numeric(colnames(data$coverage[[gene]]))
            coverages <- data$coverage[[gene]][n.rep,]
            sites <- sites[which(coverages>0)]
            coverages <- coverages[which(coverages>0)]
            if(length(sites)){
                track.gene <- GRanges(seqnames = chr, 
                    ranges = IRanges(start = sites - 1, end = sites),
                    strand = strand, coverage = coverages)
                result.track <- c(result.track, track.gene)
            }
        }
        track.bp[[track]] <- result.track
    }
    return(track.bp)
}

exonTrack <- function(data, gene){
    classlabel <- data$classlabel
    track.bin <- vector(mode = "list", length = nrow(classlabel))
    names(track.bin) <- rownames(classlabel)
    data.gene <- data$bin[[gene]]
    txnames <- unique(data.gene[,"txname"])
    strands <- data.gene[1, "strand"]
    chr <- data.gene[1, "seqnames"]
    for(n.rep in seq_len(nrow(classlabel))){
        name.track <- rownames(classlabel)[n.rep]
        track.bin[[name.track]] <- vector(mode = "list",
            length = length(txnames))
        names(track.bin[[name.track]]) <- txnames
        for(tx in txnames){
            data.tx <- data.gene[data.gene[,"txname"] == tx,]
            ifelse(strands == "+", {
                starts <- data.tx[,"start_genome"]
                ends <- data.tx[,"end_genome"]
            },{
                ends <- data.tx[,"start_genome"]
                starts <- data.tx[,"end_genome"]
            })
            track.tx <- GRanges(seqnames = chr, 
                ranges = IRanges(start = starts - 1, end = ends - 1),
                strand = strands,
                score = data$exon[[gene]][n.rep, rownames(data.tx)], 
                genes = gene, tracks = name.track, 
                exon = rownames(data.tx))
            track.bin[[name.track]][[tx]] <- track.tx
        }
    }
    return(track.bin)
}
jipingw/RiboDiPA documentation built on June 25, 2022, 4:47 p.m.