flowAI: Automatic and interactive quality control for flow cytometry data

# FlowQ test
require(RColorBrewer)
require(flowCore)
require(reshape2)
require(plyr)
require(scales)
require(ggplot2)

# Guess which channel captures time in a exprs, flowFrame or flowset
findTimeChannel <- function(xx) {
    time <- grep("^Time$", colnames(xx), value = TRUE, ignore.case = TRUE)[1]
    if (is.na(time)) {
        if (is(xx, "flowSet") || is(xx, "ncdfFlowList"))
            xx <- exprs(xx[[1]]) else if (is(xx, "flowFrame"))
                xx <- exprs(xx)
            cont <- apply(xx, 2, function(y) all(sign(diff(y)) >= 0))
            time <- names(which(cont))
    }
    if (!length(time) || length(time) > 1)
        time <- NULL
    return(time)
}

# Check if the Fcs file is ordered according to time otherwise it order it.
ord_fcs_time <- function(x, timeCh= "Time"){
    xord <- order(exprs(x)[, timeCh])

    if( !identical(xord, 1:nrow(x)) ){
        warning(paste0("Expression data in the file ", basename(keyword(x)$FILENAME),
                       " were not originally ordered by time."))
        params <- parameters(x)
        keyval <- keyword(x)
        sub_exprs <- exprs(x)[xord, ]
        newx <- flowFrame(exprs = sub_exprs, parameters = params,
                          description = keyval)
        return(newx)
    }else{
        return(x)
    }
}


flow_rate_bin <- function(x, second_fraction = 0.1, timeCh = "Time", timestep = 0.1){

    xx <- exprs(x)[, timeCh]
    idx <- c(1:nrow(x))
    lenx <- length(xx)                                                  # num of time ticks

    endsec <- ceiling(timestep * max(xx))                 # total seconds of the experiment
    tbins <- seq(0, endsec/timestep, by = second_fraction/timestep)             # time bins
    secbin <- seq(0, endsec, by = second_fraction)               # bin expressed in seconds
    minbin <- round(secbin/60, 3)                                # bin expressed in minutes
    tbCounts <- c(0, hist(xx, tbins, plot = FALSE)$counts)  # number of events per time bin

    nrBins <- length(tbins) - 1
    expEv <- lenx/(nrBins)           # median(tbCounts) # expected number of events per bin
    binID <- do.call(c, mapply(rep, x = 1:length(tbCounts), times = tbCounts,
                               SIMPLIFY = FALSE))

    if (length(idx) != length(binID))
        stop("length of cell ID not equal length of bin ID")

    flowRateData <- list(frequencies = cbind(tbins, minbin, secbin, tbCounts),
                         cellBinID = data.frame(cellID = idx, binID = binID),
                         info = data.frame(second_fraction = second_fraction,
                                           expFrequency = expEv, bins = nrBins))
    return(flowRateData)
}


flow_rate_plot <- function(flowRateData, lowerRateThres, upperRateThres,
                           lowerTimeCut, UpperTimeCut) {

    frequencies <- as.data.frame(flowRateData$frequencies)
    second_fraction <- flowRateData$info$second_fraction
    short_period <- quantile(frequencies$secbin, seq(0,1, length.out = 4))
    long_period <- quantile(frequencies$secbin, seq(0,1, length.out = 3*10 + 1))

    ## flow rate graph(frg)
    frg <- ggplot(frequencies, aes_string(x="secbin", y="tbCounts")) + geom_line(colour="red") +
        theme_bw() + theme(panel.grid.major = element_blank(),
                           panel.grid.minor = element_blank(),
                           text=element_text(size = 14))
    frg <- frg + geom_vline(xintercept=short_period, color="gray60") +
        geom_vline(xintercept=long_period, linetype = 2, color="gray60")

    frg <- frg + geom_hline(yintercept=c(lowerRateThres, upperRateThres), color="blue",
                            linetype = "longdash", size = 1.2, show_guide = TRUE)
    frg <- frg + geom_vline(xintercept=c(lowerTimeCut, UpperTimeCut), color="blue",
                            linetype = "longdash", size = 1.2, show_guide = TRUE)

    frg <- frg + labs(x= "Seconds", y= paste0("Number of cells per 1/", 1/second_fraction, " second"),
                      title= "Flow Rate Plot")

    return(frg)
}

flow_rate_check <- function(flowRateData, lowerRateThres, upperRateThres,
                           lowerTimeCut, UpperTimeCut) {

    frequencies <- as.data.frame(flowRateData$frequencies)
    cellBinID <- flowRateData$cellBinID

    goodcell_x <- which(frequencies$secbin < UpperTimeCut & frequencies$secbin > lowerTimeCut)
    goodcell_y <- which(frequencies$tbCounts < upperRateThres & frequencies$tbCounts > lowerRateThres)

    flowRateQC <- cellBinID$cellID[ cellBinID$binID %in% intersect(goodcell_x, goodcell_y) ]
    cat("flow rate high Q events: ", length(flowRateQC), "\n")
    return(flowRateQC)
}



flow_signal_bin <- function(x, channels = NULL, binSize=500, timeCh="Time", 
    timestep=0.1, TimeChCheck = NULL) {

    if (is.null(channels) || missing(channels) || is.na(channels)) {
        parms <- setdiff(colnames(x), timeCh)
    } else {
        if (!all(channels %in% colnames(x)))
            stop("Invalid channel(s)")
        parms <- channels
    }

    if (missing(binSize) || is.null(binSize) || is.na(binSize))
        binSize <- 500
   
    ### Retriving time and expression info
    exp <- exprs(x)
    if (!is.null(TimeChCheck)) {
        timex <- seq(from = 0, length.out = nrow(x), by = 0.1)
    }else{
        timex <- exp[, timeCh]
    }
    yy <- exp[, parms]  # channels data
    idx <- c(1:nrow(x))
    seconds <- timex * timestep
    lenSec <- length(seconds)  # num of time ticks
    uniSeconds <- unique(seconds)  # num of unique time tick
    nrBins <- floor(lenSec/binSize)  # num of bins

    if (length(uniSeconds) < nrBins || lenSec < binSize)
        stop("Improper bin size")

    cf <- c(rep(1:nrBins, each = binSize), rep(nrBins + 1, lenSec - nrBins * binSize))  # id bins
    stopifnot(length(cf) == lenSec)
    tmpx <- split(seconds, cf)
    xx2 <- sapply(tmpx, mean)       # mean of each time bin  (x axis)
    yy2 <- as.matrix(ddply(as.data.frame(yy), .(cf), colwise(median)))[, -1]

    return(list(exprsBin = cbind(timeSec = xx2, yy2), cellBinID = data.frame(cellID = idx, binID = cf),
                bins = length(unique(cf)), binSize = binSize))
}


flow_signal_plot <- function(flowSignalData, lowerBinThres, upperBinThres) {

    exprsBin <- flowSignalData$exprsBin

    binID <- 1:nrow(exprsBin)
    teCh <- grep("Time|time|Event|event", colnames(exprsBin), value = TRUE)
    parms <- setdiff(colnames(exprsBin), teCh)
    dataORIG <- exprsBin[, parms]     # first channel is time
    data <- as.data.frame(dataORIG)
    data$binID <- binID

    longdata <- melt(data, id.vars = "binID", variable.name = "marker", value.name = "value")
    FS_graph <- ggplot(longdata, aes(x = binID, y = value, col = marker), environment = environment()) +
        geom_line() + facet_grid(marker ~ ., scales = "free") +
        labs(x = "Bin ID", y = "Median Intensity value") + theme_bw() +
        theme(strip.text.y = element_text(angle = 0, hjust = 1), axis.text = element_text(size = 10),
              axis.title = element_text(size = 15), legend.position = "none") +
        scale_x_continuous(breaks= pretty_breaks(n = 10)) +
        scale_y_continuous(breaks= pretty_breaks(n = 3)) +
        geom_rect(aes(xmin = lowerBinThres, xmax = upperBinThres, ymin = -Inf, ymax = Inf), fill = "orange", linetype = 0, alpha = 0.005)

    return(FS_graph)
}


flow_signal_check <- function(flowSignalData, lowerBinThres, upperBinThres) {

    exprsBin <- flowSignalData$exprsBin
    cellBinID <- flowSignalData$cellBinID

    goodBins <- cellBinID$binID < upperBinThres & cellBinID$binID > lowerBinThres
    FlowSignalQC <- cellBinID$cellID[goodBins]

    cat("flow signal check: ", length(FlowSignalQC), "\n")
    return(FlowSignalQC)
}


flow_margin_check <- function(x,  margin_channels = NULL, side = "both") {

    if (is.null(margin_channels)) {
        teCh <- grep("Time|time|Event|event", colnames(x), value = TRUE)
        parms <- setdiff(colnames(x), teCh)
    } else {
        if (!all(margin_channels %in% colnames(x)))
            stop("Invalid channel(s)")
        parms <- margin_channels
    }
    scatter_parms <- grep("FSC|SSC", parms, value = TRUE)

    xx <- c(1:nrow(x))
    yy <- x@exprs[, parms]
    range <- range(x)
    lenx <- length(xx)

    ## lower check
    if (side == "lower" || side == "both") {

        out_neg_range <- apply(yy, 2, function(x) {
            neg <- which(x < 0)
            # Zscores <- (0.6745*(x[neg] + median(x[neg])))/mad(x[neg]) ## it
            # calculates the Zscore outneg <- neg[which(Zscores < -3.5)]
            min_value <- (-3.5 * mad(x[neg]) + (0.6745 * median(x[neg])))/0.6745  # -3.5 is the default threshold
            if (is.na(min_value)) {
                min(x) - 1
            } else {
                min_value
            }
        })
    }

    # n. bad cells for each channel
    if (side == "lower" || side == "both") {
        neg_bad_len <- sapply(parms, function(x) length(xx[yy[, x] <= out_neg_range[x]]))
    }
    if (side == "upper" || side == "both") {
        pos_bad_len <- sapply(parms, function(x) length(xx[yy[, x] >= range[2, x]]))
    }

    # badcellIDs
    if (side == "lower" || side == "both") {
        lowID <- do.call(c, lapply(parms, function(ch) {
            xx[yy[, ch] <= out_neg_range[ch]]
        }))
        if(length(scatter_parms) != 0){   ### check for values less than 0 in scatter parameters
            minSc <- apply(yy[,scatter_parms], 1, function(x){
                min(x)
            })
            low_scatter_ID <- which(minSc < 0)
            lowID <- unique(c(lowID, low_scatter_ID))
        }
    }
    if (side == "upper" || side == "both") {
        upID <- do.call(c, lapply(parms, function(ch) {
            xx[yy[, ch] >= range[2, ch]]
        }))
    }

    if (side == "lower") {
        summary_bad_cells <- data.frame(lower_range = c(neg_bad_len,
                                                        total_SUM = length(lowID), total_UNIQUE = length(unique(lowID))))
        bad_lowerIDs <- unique(lowID)
        bad_upperIDs <- NULL
        badCellIDs <- unique(lowID)
    } else if (side == "upper") {
        summary_bad_cells <- data.frame(upper_range = c(pos_bad_len,
                                                        total_SUM = length(upID), total_UNIQUE = length(unique(upID))))
        bad_lowerIDs <- NULL
        bad_upperIDs <- unique(upID)
        badCellIDs <- unique(upID)
    } else {
        summary_bad_cells <- data.frame(lower_range = c(neg_bad_len,
                                                        total_SUM = length(lowID), total_UNIQUE = length(unique(lowID))),
                                        upper_range = c(pos_bad_len,
                                                        total_SUM = length(upID), total_UNIQUE = length(unique(upID))))
        bad_lowerIDs <- unique(lowID)
        bad_upperIDs <- unique(upID)
        badCellIDs <- unique(c(lowID,upID))
    }

    goodCellIDs <- setdiff(xx, badCellIDs)

    cat("margin check:", length(goodCellIDs), "\n")

    return(list(goodCellIDs = goodCellIDs, bad_lowerIDs = bad_lowerIDs,
                bad_upperIDs = bad_upperIDs, events = lenx))
}


###  graph showing where the anomalies mostly happened
flow_margin_plot <- function(FlowMarginData, binSize = 500) {

    tot_events <- FlowMarginData$events
    bad_lowerIDs <- FlowMarginData$bad_lowerIDs
    bad_upperIDs <- FlowMarginData$bad_upperIDs

    if (missing(binSize) || is.null(binSize) || is.na(binSize))
        binSize <- 500
    nrBins <- floor(tot_events/binSize)

    cf <- c(rep(1:nrBins, each = binSize), rep(nrBins + 1, tot_events - nrBins * binSize))
    tmpx <- split(1:tot_events, cf)

    if(length(bad_lowerIDs) != 0 && length(bad_upperIDs) != 0){
        lowline <- sapply(tmpx, function(x){
            length(which(bad_lowerIDs %in% x))
        })
        upline <- sapply(tmpx, function(x){
            length(which(bad_upperIDs %in% x))
        })
        ymax <- max(lowline, upline)
        plot(lowline, type ="l", col = "blue", bty ="n",
            ylim = c(0, ymax), xlab = "segment ID",
            ylab = "Number of cells removed" )
        lines(upline, col = "red")
        legend("top", c("Negative Outliers", "Upper Margine Events"), lty = 1,bty = "n", cex = 0.7,
            col = c("blue", "red"))
    }else if( length(bad_lowerIDs) != 0 && length(bad_upperIDs) == 0){
        lowline <- sapply(tmpx, function(x){
            length(which(bad_lowerIDs %in% x))
        })
        plot(lowline, type ="l", col = "blue", bty ="n", xlab = "segment ID",
            ylab = "Number of cells removed" )
        legend("top", c("Negative Outliers"), lty = 1,bty = "n", cex = 0.7,
            col = "blue")
    }else if( length(bad_lowerIDs) == 0 && length(bad_upperIDs) != 0){
        upline <- sapply(tmpx, function(x){
            length(which(bad_upperIDs %in% x))
        })
        plot(upline, type ="l", col = "red", bty ="n", xlab = "segment ID",
            ylab = "Number of cells removed" )
        legend("top", c("Upper Margine Events"), lty = 1,bty = "n", cex = 0.7,
            col = "red")
    }
}


## create new flowFrame with the parameter indicating good and bad cells
addQC <- function(QCvector, sub_exprs, params, keyval){
    
    rs <- attr(sub_exprs, "ranges")
    rs <- c(rs, rs[1])
    sub_exprs <- cbind(sub_exprs, QCvector)
    attr(sub_exprs, "ranges") <- rs
    NN <- as.numeric(keyval["$PAR"]) + 1
    names(dimnames(sub_exprs)[[2]]) <- sprintf("$P%sN", 1:NN)
    pnr <- paste0("$P", NN, "R")
    pnb <- paste0("$P", NN, "B")
    pne <- paste0("$P", NN, "E")
    pnn <- paste0("$P", NN, "N")
    pns <- paste0("$P", NN, "S")
    flowCorePnRmax <- paste0("flowCore_$P", NN, "Rmax")
    flowCorePnRmin <- paste0("flowCore_$P", NN, "Rmin")
    o <- params@data
    o[length(o[,1]) + 1,] <- c("QC", "bad > 10,000", as.numeric(keyval$`$P1R`), 0, 20000)
    rownames(o)[length(o[,1])] <- paste("$P", NN, sep = "")
    
    outFCS <- new("flowFrame", exprs=sub_exprs, parameters=new("AnnotatedDataFrame",o), description=keyval)
    keyword(outFCS)[pnr] <- max(20000, keyword(outFCS)$`$P1R`)
    keyword(outFCS)[pnb] <- keyword(outFCS)$`$P1B`
    keyword(outFCS)[pne] <- "0,0"
    keyword(outFCS)[pnn] <- "QC"
    keyword(outFCS)[pns] <- "bad > 10,000"
    keyword(outFCS)$`$PAR` <- NN
    keyword(outFCS)[flowCorePnRmax] <- 20000
    keyword(outFCS)[flowCorePnRmin] <- 0
    outFCS
}
giannimonaco/flowAI documentation built on July 29, 2024, 6:22 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
giannimonaco/flowAI
Automatic and interactive quality control for flow cytometry data

inst/shiny/global.R
In giannimonaco/flowAI: Automatic and interactive quality control for flow cytometry data

R Package Documentation

Browse R Packages

We want your feedback!

giannimonaco/flowAI Automatic and interactive quality control for flow cytometry data

inst/shiny/global.R In giannimonaco/flowAI: Automatic and interactive quality control for flow cytometry data

R Package Documentation

Browse R Packages

We want your feedback!

giannimonaco/flowAI
Automatic and interactive quality control for flow cytometry data

inst/shiny/global.R
In giannimonaco/flowAI: Automatic and interactive quality control for flow cytometry data