R/utr3UsageEstimation.R
In InPAS: InPAS: a bioconductor package for the identification of novel alternative PolyAdenylation Sites (PAS) using RNA-seq data

Documented in utr3UsageEstimation

coverageCDS <- function(coverage, CDS, genome, gp1, gp2, hugeData){
    totalCov <- totalCoverage(coverage, genome, hugeData, groupList=list(gp1=gp1, gp2=gp2))
    cdsTotalCov <- UTR3TotalCoverage(CDS, totalCov)
    cdsTotalCov <- lapply(cdsTotalCov, function(.ele){
        lapply(.ele, function(.e){
            .e <- colSums(.e)/nrow(.e)
        })
    })
    cdsTotalCov <- do.call(rbind, lapply(cdsTotalCov, do.call, what=rbind))
    if(!hugeData){
        gp1 <- cdsTotalCov[, gp1, drop=FALSE]
        gp2 <- cdsTotalCov[, gp2, drop=FALSE]
        gp1 <- rowMeans(gp1)
        gp2 <- rowMeans(gp2)
        cdsTotalCov <- cbind(gp1=gp1, gp2=gp2)
    }
    cdsTotalCov
}

utr3UsageEstimation <- function(CPsites, coverage, genome, utr3, 
                                gp1, gp2=NULL, 
                                short_coverage_threshold=10, 
                                long_coverage_threshold=2, 
                                adjusted.P_val.cutoff=0.05, 
                                dPDUI_cutoff=0.3, 
                                PDUI_logFC_cutoff=0.59, 
                                BPPARAM=NULL){
    if(!all(c(gp1,gp2) %in% names(coverage))) 
        stop("gp1 and gp2 must be in names of coverage")
    if(missing(coverage) || missing(CPsites))
        stop("CPsites and coverage is required.")
    if(length(gp2)>0){
        if(missing(utr3) || missing(genome)){
            stop("utr3 and genome is required.")
        }
        if(!is(genome, "BSgenome"))
            stop("genome must be an object of BSgenome.")
        if(!is(utr3, "GRanges") | 
               !all(utr3$feature %in% c("utr3", "next.exon.gap", "CDS"))){
            stop("utr3 must be output of function of utr3Annotation")
        }
    }
    hugeData <- is.character(coverage[[1]])
    if(length(c(gp1, gp2))==1){
        coverage <- coverage[c(gp1, gp2)]
        UTRusage <- UTR3usage(CPsites, coverage, hugeData, BPPARAM, phmm=TRUE)
    }else{
        depth.weight <- depthWeight(coverage, hugeData)
        UTRusage <- UTR3usage(CPsites, coverage, hugeData, BPPARAM)
    }
    ##step3, calculate mean for each group
    UTRusage <- split(UTRusage, UTRusage$transcript)
    UTRusage <- UTRusage[sapply(UTRusage, length)>=2]
    UTRusage <- unlist(UTRusage, use.names = FALSE)
    UTRusage.short <- UTRusage[UTRusage$source=="short"]
    UTRusage.long <- UTRusage[UTRusage$source=="long"]
    UTRusage.short <- UTRusage.short[!duplicated(UTRusage.short$transcript)]
    UTRusage.long <- UTRusage.long[match(UTRusage.short$transcript, 
                                         UTRusage.long$transcript)]
    PDUItable <- UTRusage.short
    PDUItable$data <- NULL
    start(PDUItable)[as.character(strand(PDUItable))=="-"] <- 
        PDUItable$Predicted_Distal_APA[as.character(strand(PDUItable))=="-"]
    end(PDUItable)[as.character(strand(PDUItable))=="+"] <- 
        PDUItable$Predicted_Distal_APA[as.character(strand(PDUItable))=="+"]
    
    UTRusage.short.data <- do.call(rbind, UTRusage.short$data)
    UTRusage.long.data <- do.call(rbind, UTRusage.long$data)
    PDUItable$total.gp1 <- UTRusage.short.data[, gp1]
    PDUItable$long.gp1 <- UTRusage.long.data[, gp1]
    PDUItable$short.gp1 <- PDUItable$total.gp1 - PDUItable$long.gp1
    PDUItable$short.gp1[PDUItable$short.gp1<0] <- 0
    
    if(length(c(gp1, gp2))==1){
        PDUItable$data2 <- NULL
        PDUItable$test_status <- PDUItable$total.gp1>long_coverage_threshold & 
            PDUItable$long.gp1>long_coverage_threshold
        ##test by phmm ##phmm not ready, simple as t.test
        pval <- mapply(function(a, b){
            if(length(a)>10) a <- tapply(a, cut(1:length(a), 10), sum)
            if(length(b)>10) b <- tapply(b, cut(1:length(b), 10), sum)
            t.test(a, b)$p.value
        }, UTRusage.short$data2, UTRusage.long$data2)
        PDUItable$dPDUI <- PDUItable$PDUI.gp1 <- 
            PDUItable$long.gp1/(PDUItable$long.gp1 + PDUItable$short.gp1)
        PDUItable$logFC <- TRUE
    }else{
        if(is.character(gp2)){
            PDUItable$total.gp2 <- UTRusage.short.data[, gp2]
            PDUItable$long.gp2 <- UTRusage.long.data[, gp2]
            PDUItable$short.gp2 <- PDUItable$total.gp2 - PDUItable$long.gp2
            PDUItable$short.gp2[PDUItable$short.gp2<0] <- 0
        }
        if(length(gp1)>1){
            PDUItable$total.mean.gp1 <- PDUItable$total.gp1
            PDUItable$total.mean.gp1[
                PDUItable$total.gp1<short_coverage_threshold] <- NA
            PDUItable$total.mean.gp1 <- rowMeans(PDUItable$total.mean.gp1, 
                                                 na.rm=TRUE)
            PDUItable$long.mean.gp1 <- PDUItable$long.gp1
            PDUItable$long.mean.gp1[
                PDUItable$long.gp1<long_coverage_threshold] <- NA
            PDUItable$long.mean.gp1 <- rowMeans(PDUItable$long.mean.gp1, 
                                                na.rm=TRUE)
            PDUItable$short.mean.gp1 <- rowMeans(PDUItable$short.gp1)
        }else{
            PDUItable$total.mean.gp1 <- PDUItable$total.gp1
            PDUItable$total.mean.gp1[
                PDUItable$total.gp1<short_coverage_threshold] <- NA
            PDUItable$long.mean.gp1 <- PDUItable$long.gp1
            PDUItable$long.mean.gp1[
                PDUItable$long.gp1<long_coverage_threshold] <- NA
            PDUItable$short.mean.gp1 <- PDUItable$short.gp1
        }
        if(is.character(gp2)){
            if(length(gp2)>1){
                PDUItable$total.mean.gp2 <- PDUItable$total.gp2
                PDUItable$total.mean.gp2[
                    PDUItable$total.gp2<short_coverage_threshold] <- NA
                PDUItable$total.mean.gp2 <- rowMeans(PDUItable$total.mean.gp2, 
                                                     na.rm=TRUE)
                PDUItable$long.mean.gp2 <- PDUItable$long.gp2
                PDUItable$long.mean.gp2[
                    PDUItable$long.gp2<long_coverage_threshold] <- NA
                PDUItable$long.mean.gp2 <- rowMeans(PDUItable$long.mean.gp2, 
                                                    na.rm=TRUE)
                PDUItable$short.mean.gp2 <- rowMeans(PDUItable$short.gp2)
            }else{
                PDUItable$total.mean.gp2 <- PDUItable$total.gp2
                PDUItable$total.mean.gp2[
                    PDUItable$total.gp2<short_coverage_threshold] <- NA
                PDUItable$long.mean.gp2 <- PDUItable$long.gp2
                PDUItable$long.mean.gp2[
                    PDUItable$long.gp2<long_coverage_threshold] <- NA
                PDUItable$short.mean.gp2 <- PDUItable$short.gp2
            }
        }
        
        data <- as.data.frame(mcols(PDUItable))
        if(is.character(gp2)){
            ##compensation by last CDS
            ids <- data$short.mean.gp1<=0 & data$short.mean.gp2<=0
            if(sum(ids)>0){
                ##get coverage of last CDS
                CDS <- utr3[utr3$feature=="CDS"]
                CDS <- CDS[CDS$transcript %in% unique(CPsites$transcript)]
                cds.id <- match(data$transcript[ids], CDS$transcript)
                cds.id.na <- is.na(cds.id)
                CDS <- CDS[cds.id[!cds.id.na]]
                if(length(CDS)>0){
                    ##use last CDS coverage replace the short.mean.gp1 and gp2
                    CDS.cov <- coverageCDS(coverage, CDS, genome, 
                                           gp1, gp2, hugeData)
                    CDS.cov <- CDS.cov[match(names(CDS), rownames(CDS.cov)), , 
                                       drop=FALSE]
                    rownames(CDS.cov) <- names(CDS)
                    CDS.cov[is.na(CDS.cov)] <- 0
                    PDUItable$short.mean.gp1[ids][!cds.id.na] <- CDS.cov[,"gp1"] - 
                        PDUItable$long.mean.gp1[ids][!cds.id.na]
                    PDUItable$short.mean.gp2[ids][!cds.id.na] <- CDS.cov[,"gp2"] - 
                        PDUItable$long.mean.gp2[ids][!cds.id.na]
                    PDUItable$short.mean.gp1[
                        is.na(PDUItable$short.mean.gp1) | 
                            PDUItable$short.mean.gp1<0] <- 0
                    PDUItable$short.mean.gp2[
                        is.na(PDUItable$short.mean.gp2) | 
                            PDUItable$short.mean.gp2<0] <- 0
                }
            }
            
            ##compensation by ratio
            #data <- as.data.frame(mcols(PDUItable))
            #ids <- data$short.mean.gp1<=0 & data$short.mean.gp2<=0
            #ratio.gp1 <- data$long.mean.gp1/data$total.mean.gp1
            #ratio.gp2 <- data$long.mean.gp2/data$total.mean.gp2
            #ratio <- ifelse(ratio.gp1>ratio.gp2, ratio.gp1, ratio.gp2)
            #short.mean.gp1 <- data$total.mean.gp1 * ratio - data$long.mean.gp1
            #short.mean.gp2 <- data$total.mean.gp2 * ratio - data$long.mean.gp2
            #short.mean.gp1[short.mean.gp1<0] <- 0
            #short.mean.gp2[short.mean.gp2<0] <- 0
            #PDUItable$short.mean.gp1[ids] <- short.mean.gp1[ids]
            #PDUItable$short.mean.gp2[ids] <- short.mean.gp2[ids]
            PDUItable$total.mean.gp1[is.na(PDUItable$total.mean.gp1)] <- 0
            PDUItable$total.mean.gp2[is.na(PDUItable$total.mean.gp2)] <- 0
            PDUItable$long.mean.gp1[is.na(PDUItable$long.mean.gp1)] <- 0
            PDUItable$long.mean.gp2[is.na(PDUItable$long.mean.gp2)] <- 0
            PDUItable$short.mean.gp1[is.na(PDUItable$short.mean.gp1)] <- 0
            PDUItable$short.mean.gp2[is.na(PDUItable$short.mean.gp2)] <- 0
            PDUItable$test_status <- 
                (PDUItable$total.mean.gp1>=long_coverage_threshold | 
                     PDUItable$total.mean.gp2>=long_coverage_threshold) & 
                (PDUItable$long.mean.gp1>=long_coverage_threshold | 
                     PDUItable$long.mean.gp2>=long_coverage_threshold) &
                !(PDUItable$total.mean.gp1<long_coverage_threshold & 
                      PDUItable$long.mean.gp1<long_coverage_threshold)
            data <- as.data.frame(mcols(PDUItable))
            pval <- apply(data[,c("long.mean.gp1", 
                                  "short.mean.gp1", 
                                  "long.mean.gp2", 
                                  "short.mean.gp2")], 
                          1, function(.d) 
                              fisher.test(matrix(floor(.d), ncol=2))$p.value)
            PDUI.gp1 <- 
                data$long.mean.gp1/(data$long.mean.gp1 + data$short.mean.gp1)
            PDUI.gp2 <- 
                data$long.mean.gp2/(data$long.mean.gp2 + data$short.mean.gp2)
            dPDUI <- PDUI.gp2 - PDUI.gp1
            PDUItable$PDUI.gp1 <- PDUI.gp1
            PDUItable$PDUI.gp2 <- PDUI.gp2
            PDUItable$dPDUI <- dPDUI
            PDUItable$logFC <- 
                abs(log2(PDUI.gp1+.Machine$double.xmin) - 
                        log2(PDUI.gp2+.Machine$double.xmin)) >= 
                PDUI_logFC_cutoff
        }else{
            PDUItable$test_status <- 
                PDUItable$total.mean.gp1>long_coverage_threshold & 
                PDUItable$long.mean.gp1>long_coverage_threshold
            data.long <- PDUItable$long.gp1
            data.short <- PDUItable$short.gp1
            data <- log2(cbind(data.long, data.short)+0.1)
            treatments <- cbind(long=c(rep(c(1,0),c(ncol(data.long), 
                                                    ncol(data.short)))), 
                                short=c(rep(c(0,1), c(ncol(data.long), 
                                                      ncol(data.short)))))
            design <- model.matrix(~-1+treatments)
            colnames(design) <- c("long", "short")
            fit <- lmFit(data, design)
            contrast.matrix<-makeContrasts(contrasts="long-short",
                                           levels=design)
            fit <- contrasts.fit(fit,contrast.matrix)
            fit <- eBayes(fit)
            data <- topTable(fit, number=nrow(fit), sort.by="none")
            pval <- data$P.Value
            PDUItable$dPDUI <- PDUItable$PDUI.gp1 <- 
                PDUItable$long.mean.gp1/(PDUItable$long.mean.gp1 + 
                                             PDUItable$short.mean.gp1)
            PDUItable$logFC <- TRUE
        }
    }
    PDUItable$pval <- pval
    PDUItable$adjPval <- p.adjust(pval, method="BH")
    
    PDUItable$filterPass <- PDUItable$adjPval<adjusted.P_val.cutoff & 
        abs(PDUItable$dPDUI)>dPDUI_cutoff & 
        PDUItable$test_status & 
        PDUItable$logFC
    PDUItable$source <- NULL
    PDUItable$logFC <- NULL
    
    PDUItable <- PDUItable[order(PDUItable$filterPass, 
                                 abs(PDUItable$dPDUI), 
                                 decreasing=TRUE)]
}

Any scripts or data that you put into this service are public.

InPAS documentation built on Nov. 8, 2020, 5:03 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

InPAS
InPAS: a bioconductor package for the identification of novel alternative PolyAdenylation Sites (PAS) using RNA-seq data

R/utr3UsageEstimation.R
In InPAS: InPAS: a bioconductor package for the identification of novel alternative PolyAdenylation Sites (PAS) using RNA-seq data

Defines functions utr3UsageEstimation coverageCDS

Documented in utr3UsageEstimation

Try the InPAS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

InPAS InPAS: a bioconductor package for the identification of novel alternative PolyAdenylation Sites (PAS) using RNA-seq data

R/utr3UsageEstimation.R In InPAS: InPAS: a bioconductor package for the identification of novel alternative PolyAdenylation Sites (PAS) using RNA-seq data

Defines functions utr3UsageEstimation coverageCDS

Documented in utr3UsageEstimation

Try the InPAS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

InPAS
InPAS: a bioconductor package for the identification of novel alternative PolyAdenylation Sites (PAS) using RNA-seq data

R/utr3UsageEstimation.R
In InPAS: InPAS: a bioconductor package for the identification of novel alternative PolyAdenylation Sites (PAS) using RNA-seq data