R/postEnrichment.R

Defines functions plotTopLOLAEnrichments writeCombinedEnrichment writeDataTableSplitByColumn splitDataTable extractEnrichmentOverlaps

Documented in extractEnrichmentOverlaps plotTopLOLAEnrichments splitDataTable writeCombinedEnrichment writeDataTableSplitByColumn

######################################################################
# POST-ENRICHMENT - Functions for processing enrichment results
######################################################################

#' Given a single row from an enrichment table calculation,
#' finds the set of overlaps between the user set and the test set.
#' You can then use these, for example, to get sequences for those regions.
#'
#' @param locResult Results from runLOLA function
#' @param userSets User sets passed to the runLOLA function
#' @param regionDB Region database used
#' @return userSets overlapping the supplied database entry.
#'
#' @export
#' @example
#' R/examples/example.R
extractEnrichmentOverlaps = function(locResult, userSets, regionDB) {
	userSets = listToGRangesList(userSets)
	regionGRLID = which(regionDB$regionAnno$filename %in% locResult$filename &
	regionDB$regionAnno$collection %in% locResult$collection)

	userSetString = locResult[,userSet]
	userSet = userSets[[userSetString]]
	userSet[queryHits(findOverlaps(userSet,regionDB$regionGRL[[regionGRLID]]))]
}


#' Efficiently split a data.table by a column in the table
#'
#' @param DT Data.table to split
#' @param splitFactor Column to split, which can be a character vector
#'	or an integer.
#' @return	List of data.table objects, split by column
# @examples
# DT = data.table(letters, grp = rep(c("group1", "group2"), 13))
# splitDataTable(DT, "grp")
# splitDataTable(DT, 2)
splitDataTable = function(DT, splitFactor) {
	if (is.numeric(splitFactor)) {
		splitFactor = colnames(DT)[splitFactor]
		message("Integer splitFactor, changed to: ", splitFactor)
	}
	lapply( split(seq_len(nrow(DT)), DT[, get(splitFactor)]), function(x) DT[x])
}



#' Given a data table and a factor variable to split on,
#' efficiently divides the table and then writes the different splits
#' to separate files, named with filePrepend and numbered according
#' to split.
#'
#' @param DT	data.table to split
#' @param splitFactor	column of DT to split on
#' @param filePrepend	notation string to prepend to output files
#' @param orderColumn	column of DT to order on (defaults to the first column)
#' @return NULL
#'
#' @return number of splits written

writeDataTableSplitByColumn = function(DT, splitFactor, filePrepend="",
	orderColumn=NULL) {
	oldScipen = options(scipen = 4) #use scientific notation for pvalues.
	on.exit(options(oldScipen), add = TRUE)
	if (is.null(orderColumn)) {
		orderColumn = colnames(DT)[1]	#default order by first col.
	}
	length(
lapply( split(seq_len(nrow(DT)), DT[, get(splitFactor)]),
		function(x) {
			fileName = paste0(filePrepend, DT[x,get(splitFactor)][1], ".tsv")
			if (file.exists(fileName)) {
				message("Overwriting ", fileName , "...")
			} else {
				message(fileName)
			}
			write.table(DT[x,][order(get(orderColumn)),],
				file=fileName, quote=FALSE, row.names=FALSE, sep="\t")
		}
	)
	)
}


#' Function for writing output all at once: combinedResults is an table
#' generated by "locationEnrichment()" or by rbinding category/location results.
#' Writes all enrichments to a single file, and also spits out the same data
#' divided into groups based on userSets, and Databases, just for convenience.
#' disable this with an option.
#'
#' @param combinedResults	enrichment results object
#' @param outFolder	location to write results on disk
#' @param includeSplits	also include individual files for each user set and database?
#' @return No return value.
#'
#' @export
#' @example
#' R/examples/example.R
writeCombinedEnrichment = function(combinedResults, outFolder=NULL,
	includeSplits=TRUE) {
	if (outFolder == "" | is.null(outFolder)) {
		outFolder = ""
	} else if (substr(outFolder, nchar(outFolder), nchar(outFolder)) != "/") {
		outFolder = paste0(outFolder, "/")
	}

	dir.create(outFolder, showWarnings=FALSE)
	if (includeSplits) {
		if (combinedResults[,length(unique(userSet))] > 1) {
			writeDataTableSplitByColumn(
				combinedResults[order(pValueLog, decreasing=TRUE),],
				splitFactor="userSet", filePrepend=paste0(outFolder, "userSet_")
			)
		}
		if (combinedResults[,length(unique(collection))] > 1) {
			writeDataTableSplitByColumn(
				combinedResults[order(pValueLog, decreasing=TRUE),],
				splitFactor="collection", filePrepend=paste0(outFolder, "col_")
			)
		}
	}
	if (file.exists(paste0(outFolder, "allEnrichments.tsv")))
		message("Overwriting ", paste0(outFolder, "allEnrichments.tsv"), "...")
	write.table(
		combinedResults[order(pValueLog, decreasing=TRUE),], sep="\t",
		file=paste0(outFolder, "allEnrichments.tsv"), row.names=FALSE, quote=FALSE,
	)
}

#' Given some results (you grab the top ones on your own), 
#' this plots a barplot visualizing their odds ratios.
#' @param data A results table returned from runLOLA()
#' @return Returns a ggplot2 plot object.
#'
#' @export
#' @example
#' R/examples/example.R
plotTopLOLAEnrichments = function(data) {
	if ( ! requireNamespace("ggplot2")) {
		message("This function requires ggplot2")
		return()
	}
	data = data[!duplicated(description),]
	g = ggplot2::ggplot(data, ggplot2::aes(x=factor(description, levels=rev(unique(description))), y=oddsRatio)) +
		ggplot2::geom_bar(stat="identity") + ggplot2::coord_flip() + ggplot2::ggtitle("Top LOLA Enrichments") + 
		ggplot2::xlab("Database set") + 
		ggplot2::ylab("Odds ratio") + ggplot2::theme_classic()
	return(g)
}
nsheff/LOLA documentation built on Sept. 8, 2020, 11:50 a.m.