#' Downstream analysis based on MAGeCK-MLE result
#'
#' Integrative analysis pipeline using the gene summary table in MAGeCK MLE results
#'
#' @docType methods
#' @name FluteMLE
#' @rdname FluteMLE
#' @aliases flutemle
#'
#' @param gene_summary A data frame or a file path to gene summary file generated by MAGeCK-MLE.
#' @param treatname A character vector, specifying the names of treatment samples.
#' @param ctrlname A character vector, specifying the names of control samples.
#' If there is no controls in your CRISPR screen, you can specify "Depmap" as ctrlname and set `incorporateDepmap=TRUE`.
#' @param keytype "Entrez" or "Symbol".
#' @param organism "hsa" or "mmu".
#' @param incorporateDepmap Boolean, indicating whether incorporate Depmap data into analysis.
#' @param cell_lines A character vector, specifying the cell lines in Depmap to be considered.
#' @param lineages A character vector, specifying the lineages in Depmap to be considered.
#' @param norm_method One of "none", "cell_cycle" (default) or "loess".
#' @param posControl A character vector, specifying a list of positive control gene symbols.
#' @param omitEssential Boolean, indicating whether omit common essential genes from the downstream analysis.
#' @param top An integer, specifying the number of top selected genes to be labeled
#' in rank figure and the number of top pathways to be shown.
#' @param toplabels A character vector, specifying interested genes to be labeled in rank figure.
#' @param scale_cutoff Boolean or numeric, specifying how many standard deviation will be used as cutoff.
#' @param limit A two-length vector, specifying the minimal and maximal size of gene sets for enrichent analysis.
#' @param enrich_method One of "ORT"(Over-Representing Test) and "HGT"(HyperGemetric test).
#' @param proj A character, indicating the prefix of output file name, which can't contain special characters.
#' @param width The width of summary pdf in inches.
#' @param height The height of summary pdf in inches.
#' @param outdir Output directory on disk.
#' @param pathview.top Integer, specifying the number of pathways for pathview visualization.
#' @param verbose Boolean
#'
#' @author Wubing Zhang
#'
#' @return All of the pipeline results is output into the \code{out.dir}/MAGeCKFlute_\code{proj},
#' which includes a pdf file and many folders. The pdf file 'FluteMLE_\code{proj}_\code{norm_method}.pdf' is the
#' summary of pipeline results. For each section in this pipeline, figures and useful data are
#' outputed to corresponding subfolders.
#' \itemize{
#' \item {QC}: {Quality control}
#' \item {Selection}: {Positive selection and negative selection.}
#' \item {Enrichment}: {Enrichment analysis for positive and negative selection genes.}
#' \item {PathwayView}: {Pathway view for top enriched pathways.}
#' }
#'
#' @details MAGeCK-MLE can be used to analyze screen data from multi-conditioned experiments. MAGeCK-MLE
#' also normalizes the data across multiple samples, making them comparable to each other. The most important
#' ouput of MAGeCK MLE is `gene_summary` file, which includes the beta scores of multiple conditions and
#' the associated statistics. The `beta score` for each gene describes how the gene is selected: a positive
#' beta score indicates a positive selection, and a negative beta score indicates a negative selection.
#'
#' The downstream analysis includes identifying essential, non-essential, and target-associated genes,
#' and performing biological functional category analysis and pathway enrichment analysis of these genes.
#' The function also visualizes genes in the context of pathways to benefit users exploring screening data.
#'
#'
#' @seealso \code{\link{FluteRRA}}
#'
#' @examples
#' file3 = file.path(system.file("extdata", package = "MAGeCKFlute"),
#' "testdata/mle.gene_summary.txt")
#' \dontrun{
#' # functional analysis for MAGeCK MLE results
#' FluteMLE(file3, treatname = "Pmel1", ctrlname = "Pmel1_Ctrl", proj = "Pmel1")
#' }
#'
#' @import ggplot2 stats grDevices utils gridExtra grid
#' @export
FluteMLE <- function(gene_summary, treatname, ctrlname = "Depmap",
keytype = "Symbol", organism = "hsa", # Input dataset
incorporateDepmap = FALSE,
cell_lines = NA, lineages = "All",
norm_method = "cell_cycle", posControl = NULL,
omitEssential = TRUE,
top = 10, toplabels = NA,
scale_cutoff = 2, limit = c(0,200),
enrich_method = "ORT", proj = NA,
width = 10, height = 7, outdir = ".",
pathview.top = 4, verbose = TRUE){
## Prepare the running environment ##
{
message(Sys.time(), " # Create output dir and pdf file...")
outdir = file.path(outdir, paste0("MAGeCKFlute_", proj))
dir.create(file.path(outdir), showWarnings = FALSE)
output_pdf = paste0("FluteMLE_", proj, "_", norm_method, ".pdf")
pdf(file.path(outdir, output_pdf), width = width, height = height)
}
## Beta Score Preparation ##
{
beta = ReadBeta(gene_summary)
if(tolower(keytype)!="entrez"){
beta$EntrezID = TransGeneID(beta$Gene, keytype, "Entrez", organism = organism)
}else{
beta$EntrezID = beta$Gene
}
if(tolower(keytype)!="symbol"){
beta$Symbol = TransGeneID(beta$Gene, keytype, "Symbol", organism = organism)
}else{
beta$Symbol = beta$Gene
}
if(organism != "hsa"){
beta$HumanGene = TransGeneID(beta$Symbol, "Symbol", "Symbol",
fromOrg = organism, toOrg = "hsa")
beta$GeneID = TransGeneID(beta$HumanGene, "Symbol", "Entrez")
}else{
beta$HumanGene = beta$Symbol
beta$GeneID = beta$EntrezID
}
message(Sys.time(), " # Transform id to official human gene name ...")
idx1 = is.na(beta$EntrezID)
idx2 = !is.na(beta$EntrezID) & duplicated(beta$EntrezID)
idx = idx1|idx2
if(sum(idx1)>0) message(sum(idx1), " genes fail to convert into Entrez IDs: ",
paste0(beta$Gene[idx1], collapse = ", "))
if(sum(idx2)>0) message(sum(idx2), " genes have duplicate Entrez IDs: ",
paste0(beta$Gene[idx2], collapse = ", "))
dd = beta[!idx, ]
if(incorporateDepmap | "Depmap"%in%ctrlname)
dd = IncorporateDepmap(dd, symbol = "HumanGene", cell_lines = cell_lines,
lineages = lineages)
if(!all(c(ctrlname, treatname) %in% colnames(dd)))
stop("Sample name doesn't match !!!")
## Normalization
if(tolower(norm_method)=="cell_cycle")
dd = NormalizeBeta(dd, id = "HumanGene", samples = c(ctrlname, treatname),
method = "cell_cycle", posControl = posControl)
if(tolower(norm_method)=="loess")
dd = NormalizeBeta(dd, id = "HumanGene", samples = c(ctrlname, treatname), method = "loess")
}
## Distribution of beta scores ##
{
## All genes ##
outputDir1 = file.path(outdir, "QC/")
dir.create(outputDir1, showWarnings = FALSE)
idx_distr = c(ctrlname, treatname)
P1 = ViolinView(dd[, idx_distr], ylab = "Beta score", main = "All genes",
filename = paste0(outputDir1, "ViolinView_all_", norm_method, ".png"))
P2 = DensityView(dd[, idx_distr], xlab = "Beta score", main = "All genes",
filename = paste0(outputDir1, "DensityView_all_", norm_method, ".png"))
P3 = ConsistencyView(dd, ctrlname, treatname, main = "All genes",
filename = paste0(outputDir1, "Consistency_all_", norm_method, ".png"))
P4 = MAView(dd, ctrlname, treatname, main = "All genes",
filename = paste0(outputDir1, "MAView_all_", norm_method, ".png"))
gridExtra::grid.arrange(P1, P2, P3, P4, ncol = 2)
## Essential genes ##
Zuber_Essential = readRDS(file.path(system.file("extdata", package = "MAGeCKFlute"),
"Zuber_Essential.rds"))
if(is.null(posControl))
idx = toupper(dd$HumanGene) %in% toupper(Zuber_Essential$GeneSymbol)
else
idx = toupper(dd$Gene) %in% toupper(posControl)
if(sum(idx) > 10){
P1 = ViolinView(dd[idx, idx_distr], ylab = "Essential.B.S.", main = "Essential genes",
filename = paste0(outputDir1, "ViolinView_posctrl_", norm_method, ".png"))
P2 = DensityView(dd[idx, idx_distr], xlab = "Essential.B.S.", main = "Essential genes",
filename = paste0(outputDir1, "DensityView_posctrl_", norm_method, ".png"))
P3 = ConsistencyView(dd[idx, ], ctrlname, treatname, main = "Essential genes",
filename = paste0(outputDir1, "Consistency_posctrl_", norm_method, ".png"))
P4 = MAView(dd[idx, ], ctrlname, treatname, main = "Essential genes",
filename = paste0(outputDir1, "MAView_posctrl_", norm_method, ".png"))
gridExtra::grid.arrange(P1, P2, P3, P4, ncol = 2)
}
}
## Combine replicates ##
{
dd$Control = Biobase::rowMedians(as.matrix(dd[, ctrlname, drop = FALSE]))
dd$Treatment = Biobase::rowMedians(as.matrix(dd[, treatname, drop = FALSE]))
dd$Diff = dd$Treatment - dd$Control
write.table(dd, paste0(outputDir1, proj, "_processed_data.txt"),
sep = "\t", row.names = FALSE, quote = FALSE)
x_cut = c(-CutoffCalling(dd$Control, scale_cutoff),
CutoffCalling(dd$Control, scale_cutoff))
y_cut = c(-CutoffCalling(dd$Treatment, scale_cutoff),
CutoffCalling(dd$Treatment, scale_cutoff))
intercept = c(-CutoffCalling(dd$Diff, scale_cutoff),
CutoffCalling(dd$Diff, scale_cutoff))
if(omitEssential){
dd = OmitCommonEssential(dd, symbol = "HumanGene", dependency = -0.4)
write.table(dd, paste0(outputDir1, proj, "_omitessential_data.txt"),
sep = "\t", row.names = FALSE, quote = FALSE)
}
dd$Rank = rank(dd$Diff)
dd$RandomIndex = sample(1:nrow(dd), nrow(dd))
dd$Gene = dd$Symbol
}
## Drug-targeted genes ##
{
outputDir2 = file.path(outdir, "Selection/")
dir.create(outputDir2, showWarnings = FALSE)
p1 = ScatterView(dd, "Control", "Treatment", groups = c("top", "bottom"),
groupnames = c("GroupA", "GroupB"), intercept = intercept)
ggsave(paste0(outputDir2, "ScatterView_TreatvsCtrl_", norm_method, ".png"),
p1, width = 4, height = 3)
write.table(p1$data, paste0(outputDir2, "Data_ScatterView_TreatvsCtrl.txt"),
sep = "\t", row.names = FALSE, quote = FALSE)
p2 = ScatterView(dd, x = "Rank", y = "Diff", label = "Symbol",
groups = c("top", "bottom"), groupnames = c("GroupA", "GroupB"),
top = top, y_cut = y_cut)
ggsave(paste0(outputDir2, "RankView_Treat-Ctrl_", norm_method, ".png"),
p2, width = 3, height = 5)
p3 = ScatterView(dd[dd$Diff>0, ], x = "RandomIndex", y = "Diff", label = "Symbol",
y_cut = y_cut, groups = "top", groupnames = c("GroupA"), top = top)
ggsave(paste0(outputDir2, "ScatterView_Treat-Ctrl_Positive_", norm_method, ".png"),
p3, width = 4, height = 3)
p4 = ScatterView(dd[dd$Diff<0, ], x = "RandomIndex", y = "Diff", label = "Symbol",
y_cut = y_cut, groups = "bottom", groupnames = c("GroupB"), top = top)
ggsave(paste0(outputDir2, "ScatterView_Treat-Ctrl_Negative_", norm_method, ".png"),
p4, width = 4, height = 3)
gridExtra::grid.arrange(p1, p2, p3, p4, ncol = 2)
}
## Enrichment analysis of negative and positive selected genes ##
{
outputDir3 = file.path(outdir, "Enrichment/")
outputDir4 = file.path(outdir, "PathwayView/")
dir.create(outputDir3, showWarnings=FALSE)
dir.create(outputDir4, showWarnings=FALSE)
E1 = EnrichAB(p1$data, enrich_method = enrich_method,
organism = organism, limit = limit, top = top,
filename = norm_method, out.dir = outputDir3)
# EnrichedView
gridExtra::grid.arrange(E1$keggA$gridPlot, E1$reactomeA$gridPlot, E1$gobpA$gridPlot, E1$complexA$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$keggB$gridPlot, E1$reactomeB$gridPlot, E1$gobpB$gridPlot, E1$complexB$gridPlot, ncol = 2)
# Pathway view for top 4 pathway
if(!is.null(E1$keggA$enrichRes) && nrow(E1$keggA$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$keggA$enrichRes$ID),
top = pathview.top, ncol = 2, title="Group A",
organism=organism, output=outputDir4)
if(!is.null(E1$keggB$enrichRes) && nrow(E1$keggB$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$keggB$enrichRes$ID),
top = pathview.top, ncol = 2,
title="Group B", organism=organism,
output=outputDir4)
}
## Nine-squares ##
{
p1 = ScatterView(dd, x = "Control", y = "Treatment", label = "Symbol",
groups = c("midleft", "topcenter", "midright", "bottomcenter"),
groupnames = c("Group1", "Group2", "Group3", "Group4"),
top = top, display_cut = TRUE,
x_cut = x_cut, y_cut = y_cut, intercept = intercept)
ggsave(paste0(outputDir2, "SquareView_", norm_method, ".png"), p1, width = 5, height = 4)
write.table(p1$data, paste0(outputDir2, proj, "squareview_data.txt"),
sep = "\t", row.names = FALSE, quote = FALSE)
gridExtra::grid.arrange(p1, ncol = 1)
}
## Nine-Square grouped gene enrichment ##
{
E1 = EnrichSquare(p1$data, id = "GeneID", keytype = "entrez",
x = "Control", y = "Treatment", top = top,
enrich_method = enrich_method, limit = limit,
filename=norm_method, out.dir=outputDir3)
# EnrichView
gridExtra::grid.arrange(E1$kegg1$gridPlot, E1$reactome1$gridPlot, E1$gobp1$gridPlot, E1$complex1$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$kegg2$gridPlot, E1$reactome2$gridPlot, E1$gobp2$gridPlot, E1$complex2$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$kegg3$gridPlot, E1$reactome3$gridPlot, E1$gobp3$gridPlot, E1$complex3$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$kegg4$gridPlot, E1$reactome4$gridPlot, E1$gobp4$gridPlot, E1$complex4$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$kegg12$gridPlot, E1$reactome12$gridPlot, E1$gobp12$gridPlot, E1$complex12$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$kegg13$gridPlot, E1$reactome13$gridPlot, E1$gobp13$gridPlot, E1$complex13$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$kegg24$gridPlot, E1$reactome24$gridPlot, E1$gobp24$gridPlot, E1$complex24$gridPlot, ncol = 2)
gridExtra::grid.arrange(E1$kegg34$gridPlot, E1$reactome34$gridPlot, E1$gobp34$gridPlot, E1$complex34$gridPlot, ncol = 2)
# PathwayView
if(!is.null(E1$kegg1$enrichRes) && nrow(E1$kegg1$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg1$enrichRes$ID), ncol = 2,
top = pathview.top, title = "Group 1", organism=organism,
output=outputDir4)
if(!is.null(E1$kegg2$enrichRes) && nrow(E1$kegg2$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg2$enrichRes$ID), ncol = 2,
top = pathview.top, title = "Group 2",
organism=organism,output=outputDir4)
if(!is.null(E1$kegg3$enrichRes) && nrow(E1$kegg3$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg3$enrichRes$ID), ncol = 2,
top = pathview.top, title = "Group 3",
organism=organism, output=outputDir4)
if(!is.null(E1$kegg4$enrichRes) && nrow(E1$kegg4$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg4$enrichRes$ID), ncol = 2,
title = "Group 4", organism = organism,
top = pathview.top, output=outputDir4)
if(!is.null(E1$kegg12$enrichRes) && nrow(E1$kegg12$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg12$enrichRes$ID), ncol = 2,
title = "Group 1 & Group 2", organism=organism,
top = pathview.top, output=outputDir4)
if(!is.null(E1$kegg13$enrichRes) && nrow(E1$kegg13$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg13$enrichRes$ID), ncol = 2,
title = "Group 1 & Group 3", organism=organism,
top = pathview.top, output=outputDir4)
if(!is.null(E1$kegg24$enrichRes) && nrow(E1$kegg24$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg24$enrichRes$ID), ncol = 2,
title = "Group 2 & Group 4", organism=organism,
top = pathview.top, output=outputDir4)
if(!is.null(E1$kegg34$enrichRes) && nrow(E1$kegg34$enrichRes)>0)
arrangePathview(dd, gsub("KEGG_", "", E1$kegg34$enrichRes$ID), ncol = 2,
title = "Group 3 & Group 4", organism=organism,
top = pathview.top, output=outputDir4)
}
dev.off()
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.