R/util.R

Defines functions .checkVert .getTotal .listFilesInOrder .inferAnalyzed .capitalize .summarySE .getLineTypes .saveAs .readSummary .substituteStringInFile .emptyPlot .loadMatrixFromDF .findRepertoires

Documented in .capitalize .checkVert .emptyPlot .findRepertoires .getLineTypes .getTotal .inferAnalyzed .loadMatrixFromDF .readSummary .saveAs .substituteStringInFile .summarySE

# Colour (hex code, light blue) used when a plot shows a single sample
BLUEHEX <- "#56B4E9"

# Plot sizes (width/height pairs).
# NOTE(review): the V_/H_/VENN_ prefixes presumably stand for vertical,
# horizontal and Venn-diagram layouts, and the _L suffix for a larger variant;
# the unit is not stated in this file -- confirm against the code that saves
# the plots.
V_WIDTH <- 8
V_HEIGHT <- 5
H_WIDTH <- 5
H_HEIGHT <- 8
VENN_WIDTH <- 8
VENN_HEIGHT <- 8
V_WIDTH_L <- 12
V_HEIGHT_L <- 7.5

# AbSeq's output directory structure(s)
# NOTE(review): RESULT_DIR holds "auxiliary" while AUX_DIR holds "hdf"; the
# names look swapped relative to their values -- confirm against abseqPy's
# output layout before relying on either name.
RESULT_DIR <- "auxiliary"
AUX_DIR <- "hdf"
# AbSeq's analysis directory names (one sub-directory name per analysis type)
ABSEQ_DIR_ANNOT <- "annot"
ABSEQ_DIR_PROD <- "productivity"
ABSEQ_DIR_ABUN <- "abundance"
ABSEQ_DIR_DIV <- "diversity"
ABSEQ_DIR_PAIR <- "clonotype_analysis"
ABSEQ_DIR_PRIM <- "primer_specificity"
ABSEQ_DIR_5UTR <- "utr5"
ABSEQ_DIR_SEC <- "secretion"
# Directory names used for the HTML report output
ABSEQ_HTML_DIR <- "report"
ABSEQ_NESTED_HTML_DIR <- "html_files"

# Name of the parameter file written by an AbSeq run
ANALYSIS_PARAMS <- "analysis.params"

# Name of AbSeq's summary file about the repertoire - raw/annot/prod counts
ABSEQ_SUMMARY <- "summary.txt"

# These are the keys found in the ABSEQ_SUMMARY file.
# Each line of that file is expected to have the format `<key>:<value>`,
# for example:
# RawReads:<number>
# AnnotatedReads:<number>
ABSEQ_RAW_READ_COUNT_KEY <- "RawReads"
ABSEQ_ANNOT_READ_COUNT_KEY <- "AnnotatedReads"
ABSEQ_FILT_READ_COUNT_KEY <- "FilteredReads"
ABSEQ_PROD_READ_COUNT_KEY <- "ProductiveReads"


#' Checks if abseqPy has a metadata line that suggests
#' the orientation
#'
#' @description Only the first line of \code{filename} is read; it is searched
#' for the literal substring "vert".
#'
#' @param filename csv filename
#'
#' @return \code{TRUE} if the first line contains "vert" (i.e. the CSV metadata
#' says "plot vertically"), \code{FALSE} otherwise, including when the file is
#' empty.
.checkVert <- function(filename) {
    # Given a path, readLines() opens and closes the connection itself, so
    # nothing stays open if reading fails.
    header <- readLines(filename, n = 1)
    # An empty file has no header line. Answer FALSE instead of logical(0),
    # which would make `if (.checkVert(...))` fail in callers.
    if (length(header) == 0) {
        return(FALSE)
    }
    grepl("vert", header, fixed = TRUE)
}

#' Get total number of samples (n)
#'
#' @description Often enough, the CSV values supplied do not contain
#' raw counts but percentages (so this value will let us know exactly
#' the sample size). The value is taken from the first line of the file, which
#' is split on "="; the second piece is returned.
#'
#' @param filename csv filename
#'
#' @return string, sample size (the text between the first and second "=" on
#' the first line). \code{NA} if the first line contains no "=".
.getTotal <- function(filename) {
    # Given a path, readLines() opens and closes the connection itself, so
    # nothing stays open if reading fails.
    header <- readLines(filename, n = 1)
    pieces <- unlist(strsplit(header, "="))
    pieces[2]
}

# Lists the files matching `pattern` under every directory in `path`, keeping
# the results in the order of `path` (list.files() on its own would sort them).
# This matters because callers assume sample names are in 1-1 correspondence
# with the returned files.
#
# Arguments:
#   path        vector of directories, searched recursively, one at a time.
#   pattern     regular expression handed to list.files().
#   expectedRet vector of acceptable match counts per directory. It is a
#               vector because more than one configuration can be valid, e.g.
#               abundance output may or may not have a D gene file depending on
#               whether heavy or light chains were analyzed.
#
# Returns: vector of full file paths ordered by `path`, or NULL as soon as one
# directory has no matching file (samples can't be compared if one of them is
# missing). Stops if a directory has a match count that is not in
# `expectedRet`.
.listFilesInOrder <- function(path, pattern, expectedRet = c(1)) {
    # one slot per directory, so the result is not re-copied on every append
    filesPerPath <- vector("list", length(path))
    for (i in seq_along(path)) {
        found <- list.files(
            path = path[[i]],
            pattern = pattern,
            full.names = TRUE,
            recursive = TRUE
        )
        # short circuit once at least one of the paths don't have the required
        # file. (Can't compare all the samples if one of them is missing!)
        if (length(found) == 0) {
            return(c())
        }
        if (!(length(found) %in% expectedRet)) {
            # the count may be larger OR smaller than expected, so the message
            # must not claim that "only" N files were found
            stop("Expected either ", paste(expectedRet, collapse = ", "),
                 " files to be found, but ",
                 length(found),
                 " were found.")
        }
        filesPerPath[[i]] <- found
    }
    unlist(filesPerPath, use.names = FALSE)
}

#' Lists the sample directories found directly under \code{sampleDirectory}
#'
#' @description Every entry of \code{sampleDirectory} that is itself a
#' directory is considered a sample; plain files are ignored.
#'
#' @param sampleDirectory string, path to sample directory.
#'
#' @return character vector with the names (not the full paths) of all
#' directories directly under \code{sampleDirectory}
.inferAnalyzed <- function(sampleDirectory) {
    entries <- list.files(sampleDirectory)
    # file.path() and dir.exists() are vectorised, so all entries are checked
    # in one go
    isDirectory <- dir.exists(file.path(sampleDirectory, entries))
    entries[isDirectory]
}

#' Upper-cases the first character of \code{str}
#'
#' @description The remaining characters are left exactly as they are.
#'
#' @param str string type
#'
#' @return string, \code{str} with its first letter capitalized
.capitalize <- function(str) {
    initial <- toupper(substr(str, 1, 1))
    remainder <- substr(str, 2, nchar(str))
    paste0(initial, remainder)
}

#' Per-group summary statistics of one column of a data frame
#'
#' @description For every combination of the grouping variables, computes the
#' count, mean, standard deviation, standard error of the mean, and the
#' half-width of the confidence interval (default 95\%) of \code{measurevar}.
#'
#' adapted from http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/#Helper functions
#'
#' @param data a data frame.
#' @param measurevar the name of the column that contains the variable to be
#' summarized
#' @param groupvars a vector containing names of columns that contain grouping
#' variables
#' @param na.rm a boolean that indicates whether to ignore NA's
#' @param conf.interval the percent range of the confidence interval
#' (default is 95\%)
#' @param .drop logical, passed on to \code{ddply}.
#'
#' @import plyr stats
#'
#' @return dataframe with the grouping columns followed by \code{N}, a column
#' named after \code{measurevar} (holding the group mean), \code{sd},
#' \code{se} and \code{ci}
.summarySE <-
    function(data = NULL,
             measurevar,
             groupvars = NULL,
             na.rm = FALSE,
             conf.interval = .95,
             .drop = TRUE) {
        # Number of observations; NA's are left out of the count when na.rm
        # is set
        countValues <- function(values) {
            if (na.rm) {
                return(sum(!is.na(values)))
            }
            length(values)
        }

        # N, mean and sd of the measured column for one group's data frame
        summariseGroup <- function(group, column) {
            values <- group[[column]]
            c(
                N    = countValues(values),
                mean = mean(values, na.rm = na.rm),
                sd   = sd(values, na.rm = na.rm)
            )
        }

        stats <- ddply(
            data,
            groupvars,
            .fun = summariseGroup,
            measurevar,
            .drop = .drop
        )

        # The "mean" column takes the name of the measured variable
        stats <- rename(stats, c("mean" = measurevar))

        # Standard error of the mean
        stats$se <- stats$sd / sqrt(stats$N)

        # The confidence interval half-width is the standard error scaled by a
        # t quantile: e.g. for conf.interval = .95 the .975 quantile is used
        # (above/below), with N - 1 degrees of freedom
        tQuantile <- qt(conf.interval / 2 + .5, stats$N - 1)
        stats$ci <- stats$se * tQuantile

        stats
    }

#' Helper function to return line types by importance based on provided
#' CD/Fs regions
#'
#' @description In the aesthetics of diversity plots (rarefaction, recapture,
#' and duplication), the line types should emphasise the most important
#' antibody region, they're ranked in ascending order of:
#' "FR4", "FR1", "FR2", "FR3", "CDR1", "CDR2", "CDR3", "V".
#' The most important region present gets "solid", the next one "twodash",
#' and so on. Region names are matched case-insensitively; names outside the
#' ranking are treated as the least important.
#'
#' @param regions a list/vector of strings (regions), at most 6 of them
#'
#' @return vector of strings, the i-th being the line type for the i-th
#' element of \code{regions}.
.getLineTypes <- function(regions) {
    if (length(regions) > 6) {
        stop("No line types for regions with length > 6 ")
    }
    regions <- toupper(unlist(regions))
    # order of importance: min -> max
    lvls <-
        c("FR4", "FR1", "FR2", "FR3", "CDR1", "CDR2", "CDR3", "V")
    # order of importance: max -> min
    lines <-
        c("solid",
          "twodash",
          "dotted",
          "dotdash",
          "longdash",
          "dashed")

    importance <- factor(regions, levels = lvls)
    # byImportance[k] is the position (in regions) of the k-th most important
    # region ...
    byImportance <- order(importance, decreasing = TRUE)
    # ... but what is needed is the inverse: the importance rank of each
    # region, in the order the regions were given. Indexing `lines` with
    # byImportance itself hands line types to the wrong regions whenever that
    # permutation is not its own inverse (e.g. c("FR1", "V", "CDR3")).
    rankOfRegion <- order(byImportance)
    lines[rankOfRegion]
}


#' Saves ggplot object as a Rdata file.
#'
#' @description It's a convenient function that does the check and saves
#' at the same time, for brevity within other areas of the code (to eliminate
#' repeated if checks). The file is written next to \code{filename}, with the
#' file extension replaced by "Rdata" (or ".Rdata" appended if
#' \code{filename} has no extension). The object is stored under the name
#' \code{plot}.
#'
#' @import tools
#'
#' @param .save logical type. Whether or not we should save.
#' @param filename string.
#' @param plot ggplot object.
#'
#' @return nothing
.saveAs <- function(.save, filename, plot) {
    if (.save) {
        # Only the trailing extension is swapped. Substituting the extension
        # text with sub() would treat it as a regular expression and hit its
        # first occurrence anywhere in the path (e.g. the "png" in
        # "png_plots/a.png").
        fname <- paste0(tools::file_path_sans_ext(filename), ".Rdata")
        save(file = fname, list = c("plot"))
    }
    invisible(NULL)
}


#' Return value specified by key from AbSeq's summary file
#'
#' @description Looks for the first line of the summary file that contains
#' \code{key} and returns what follows the first ":" on that line (up to the
#' next ":", if any).
#'
#' @param sampleRoot sample's root directory. For example,
#' \code{/path/to/<outputdir>/reports/<sample_name>}.
#' @param key character type. Possible values are (though they might change)
#' \itemize{
#'    \item{RawReads}
#'    \item{AnnotatedReads}
#'    \item{FilteredReads}
#'    \item{ProductiveReads}
#' }
#' @return value (as a string) associated with key from summary file. "NA"
#' (in string) if the field is not available. Refer to util.R for the key
#' values
.readSummary <- function(sampleRoot, key) {
    fname <- file.path(sampleRoot, ABSEQ_SUMMARY)
    # Given a path, readLines() opens and closes the connection itself, so
    # nothing stays open if reading fails.
    lines <- readLines(fname)
    # key is matched literally, anywhere in the line
    hits <- grep(key, lines, fixed = TRUE)
    if (length(hits) == 0) {
        return("NA")
    }
    # only the first matching line counts
    strsplit(lines[hits[1]], ":")[[1]][2]
}


#' Substitutes `key` with `value` in `filename`, in place
#'
#' @description The file is read, the first occurrence of `key` on every line
#' is replaced by `value` (later occurrences on the same line are left alone),
#' and the result is written back over the original file.
#'
#' @param filename character type
#' @param key character type. A regular expression, unless \code{fixed} is
#' \code{TRUE}
#' @param value character type
#' @param fixed logical type. If \code{TRUE}, \code{key} is matched literally
#'
#' @return None
.substituteStringInFile <-
    function(filename, key, value, fixed = FALSE) {
        # Both readLines() and cat() are handed the path itself, so each one
        # opens and closes its own connection and nothing stays open if
        # reading or writing fails.
        lines <- readLines(filename)
        lines <- sub(key, value, lines, fixed = fixed)
        cat(lines, file = filename, sep = "\n")
        invisible(NULL)
    }


#' Builds a plot that draws nothing visible
#'
#' @description Used as a placeholder: a single white point on a theme with
#' every background, grid, border, axis title, axis text and tick blanked out.
#'
#' @import ggplot2
#'
#' @return empty ggplot2 object
.emptyPlot <- function() {
    # placeholder plot - prints nothing at all
    # https://www.r-bloggers.com/ggplot2-cheatsheet-for-visualizing-distributions/
    # https://github.com/mikessh/vdjtools/blob/master/src/main/resources/rscripts/intersect_pair_scatter.r
    blank <- element_blank()
    invisibleTheme <- theme(
        plot.background = blank,
        panel.grid.major = blank,
        panel.grid.minor = blank,
        panel.border = blank,
        panel.background = blank,
        axis.title.x = blank,
        axis.title.y = blank,
        axis.text.x = blank,
        axis.text.y = blank,
        axis.ticks = blank,
        plot.margin = unit(c(3, -5.5, 4, 3), "mm")
    )
    whitePoint <- geom_point(aes(1, 1), colour = "white")
    ggplot() + whitePoint + invisibleTheme
}


#' Given a dataframe with the columns "from", "to", and value.var, return
#' a symmetric matrix. I.e. a call to
#' isSymmetric(return_value_of_this_function) will always be TRUE (the
#' function stops otherwise).
#'
#'
#' @import reshape2
#'
#' @param dataframe dataframe with 3 required columns, namely "from", "to",
#' and the column named by \code{value.var}. Other columns may be present.
#' @param value.var the column to use as the matrix value
#' @param diag the value used for every from-to combination that the
#' dataframe has no row for; typically these are the diagonal cells
#' @param unidirectional logical type. If the dataframe provided has the reverse
#' pairs (i.e. a from-to pair AND a to-from pair with the same values in the
#' value.var column), then this should be FALSE. Otherwise, this function will
#' flip the from-to columns to generate a symmetric dataframe (and hence, a
#' symmetric matrix).
#'
#' @return a symmetric matrix with rownames(mat) == colnames(mat)
#' Cells without data (usually the diagonal) are filled with diag
.loadMatrixFromDF <-
    function(dataframe,
             value.var,
             diag,
             unidirectional = TRUE) {
        if (unidirectional) {
            # mirror every row: swap "from" and "to" (by name, so the position
            # of these columns does not matter); other columns stay the same
            mirrored <- dataframe
            mirrored[["from"]] <- dataframe[["to"]]
            mirrored[["to"]] <- dataframe[["from"]]
            # A self-pair (from == to) is its own mirror image. Adding it a
            # second time would give acast() two rows for one cell, which it
            # resolves by counting rows instead of keeping the value.
            selfPair <-
                as.character(mirrored[["from"]]) == as.character(mirrored[["to"]])
            mirrored <- mirrored[!(selfPair %in% TRUE), , drop = FALSE]
            # rowbind the dataframes into one
            df.f <- rbind(dataframe, mirrored)
        } else {
            # bidirectional dataframe doesn't require mirror-ing
            df.f <- dataframe
        }
        mat <-
            reshape2::acast(df.f, from ~ to, value.var = value.var, fill = diag)
        # put the columns in the same order as the rows; drop = FALSE keeps a
        # 1 x 1 result a matrix instead of collapsing it to a vector
        mat <- mat[, rownames(mat), drop = FALSE]
        # make sure the matrix is symmetric
        stopifnot(isSymmetric(mat))
        mat
    }


#' Lists the repertoires (samples) found in
#' directory = <abseqPy_outputdir>/RESULT_DIR/
#'
#' An entry of 'directory' counts as a repertoire only if it contains both an
#' analysis.params file and a summary.txt file. A sample_vs_sample directory
#' will not have these files, so it is never returned.
#'
#' @param directory string. Path up until <abseqPy_outputdir>/RESULT_DIR/
#'
#' @return vector of strings that are samples in 'directory', note, this is NOT
#' a full path, but just the sample/repertoire name itself
.findRepertoires <- function(directory) {
    candidates <- list.files(directory, full.names = TRUE)
    requiredFiles <- c("analysis.params", "summary.txt")
    # TRUE for every candidate that holds all of the required files
    isRepertoire <- vapply(
        candidates,
        function(candidate) {
            all(requiredFiles %in% list.files(candidate))
        },
        FUN.VALUE = logical(1),
        USE.NAMES = FALSE
    )
    basename(candidates[isRepertoire])
}
malhamdoosh/abseqR documentation built on May 24, 2019, 12:36 a.m.