#################
## rankGenes
#################
.checkParamsRankGenes <- function(sceObject, simMed){
## Check if the normalized matrix is correct
if(all(dim(sceObject) == c(0,0)))
stop("The 'scRNAseq' object that you're using with 'rankGenes' ",
"function doesn't have its 'SceNorm' slot updated. Please",
" use 'normaliseCountMatrix' on the object before.")
## Check if the SCE object contain cluster colums in its colData
if(!("clusters" %in% names(colData(sceObject))))
stop("The 'scRNAseq' object that you're using with 'rankGenes' ",
"function doesn't have a correct 'SceNorm' slot. This slot",
" should be a 'SingleCellExperiment' object containing ",
"'clusters' column in its colData. Please check if you ",
"correctly used 'clusterCellsInternal' on the object.")
## Check the cluster similarity matrix
if(all(dim(simMed) == c(0,0)) || all(dim(simMed) == c(1,1)))
stop("The 'scRNAseq' object that you're using with 'rankGenes' ",
"function doesn't have its 'clustersSimilarityMatrix' slot",
" updated. Please use 'clusterCellsInternal' on the object",
" before.")
}
#' .buildTTestFDR
#'
#' @description
#' Adjust p-values according to the FDR method.
#'
#' @param mat Normalized expression matrix.
#' @param allGroups Character vector of the different clusters names.
#' @param currentGroup cluster name currently processed in the mapply.
#' @param tTestPval A list of p-values of a cluster comparing it to all the
#' other clusters.
#' @param currentSimMed Current column name of the cluster similarity matrix
#' processed in the mapply.
#' @param simMedNames Column names of the cluster similarity matrix.
#' @keywords internal
#' @return A list containing for each cluster the adjustted p-value of the
#' t-test, the mean log 10 FDR, and the score.
#' @noRd
.buildTTestFDR <- function(mat, allGroups, currentGroup, tTestPval,
currentSimMed, simMedNames){
tTestFDR <- data.frame(row.names=rownames(mat))
otherGroups <- allGroups[allGroups!=currentGroup]
tTestFDR <- lapply(otherGroups, function(currentOther){
tTestFDR[, paste0("vs_", currentOther)] <-
p.adjust(tTestPval[, paste0("vs_",
currentOther)],
method="fdr")
return(tTestFDR)
})
tTestFDR <- dplyr::bind_cols(tTestFDR)
tTestFDR <- cbind(Gene=as.factor(rownames(mat)), tTestFDR)
submat <- as.matrix(tTestFDR[, 2:(length(otherGroups) + 1 )])
## Add column mean_log10_fdr
tTestFDR$mean_log10_fdr <- rowMeans(log10(submat + 1e-300))
## Add column n_05
tTestFDR$n_05 <- apply(submat, 1, function(x)
length(x[!is.na(x) & x < 0.05]))
##Add column score
weights <- currentSimMed[match(otherGroups, simMedNames)]
tTestFDR$score <- apply(submat, 1, function(x)
sum(-log10(x+1e-300) * weights) / ncol(submat))
tTestFDR <- tTestFDR[order(tTestFDR$score, decreasing=TRUE), ]
row.names(tTestFDR) <- NULL
return(tTestFDR)
}
#' .buildTTestPval
#'
#' @description
#' Generate a list of p-values of a cluster comparing it to all the other
#' clusters.
#'
#' @param otherGroups List of marker genes for the other groups.
#' @param tTestPval An empty data frame to retrieve the results.
#' @param colDF Data frame of Metadata of the scRNA-Seq object.
#' @param mat Normalized expression matrix.
#' @param colLabel Name of the column with a clustering result.
#' Default="clusters"
#' @param currentGroup Marker genes for the cluster currently considered in the
#' mapply of .buildMarkerGenesList.
#'
#' @keywords internal
#' @return A list containing for each cluster the p-value of the t-test.
#' @noRd
.buildTTestPval <- function(otherGroups, tTestPval, colDF, mat, colLabel,
currentGroup){
tTestPval <- data.frame(row.names=rownames(mat))
tTestPval <- lapply(otherGroups, function(currentOther){
tTestPval[, paste0("vs_", currentOther)] <- NA
x <- as.matrix(mat[, colDF[, c(colLabel)] == currentGroup])
y <- as.matrix(mat[, colDF[, c(colLabel)] == currentOther])
t <- (rowMeans(x) - rowMeans(y)) / sqrt(apply(mat, 1,
var) * (1 / ncol(x) + 1 / ncol(y)))
df <- ncol(x) + ncol(y) - 2
tTestPval[, paste0("vs_", currentOther)] <- pt(t,
df, lower.tail=FALSE)
return(tTestPval)
})
tTestPval <- dplyr::bind_cols(tTestPval)
tTestPval <- cbind(Gene=rownames(mat), tTestPval)
return(tTestPval)
}
#' .buildMarkerGenesList
#'
#' @description
#' Generate genes list for each cluster.
#'
#' @param theObject An Object of class scRNASeq for which the count matrix was
#' normalized (see ?normaliseCountMatrix), tSNE were calculated (see
#' ?generateTSNECoordinates), dbScan was run (see ?runDBSCAN), cells were
#' clustered (see ?clusterCellsInternal), as clusters themselves (see
#' ?calculateClustersSimilarity).
#' @param groups Character vector of the different clusters names.
#' @param simMedRowList List of similarity values per cluster.
#' @param exprM Normalized expression matrix.
#' @param column Name of the column with a clustering result. Default="clusters"
#' @param colNamesVec Column names of the cluster similarity matrix.
#' @param colDF Data frame of Metadata of the scRNA-Seq object.
#' @param writeMarkerGenes If TRUE, output one list of marker genes per cluster
#' in the output directory defined in theObject and in the sub-directory
#' 'marker_genes'. Default=FALSE.
#'
#' @keywords internal
#' @return A list containing for each cluster the marker genes.
#' @importFrom utils write.table
#' @importFrom stats p.adjust
#' @importFrom stats var
#' @importFrom stats pt
#' @noRd
.buildMarkerGenesList <- function(theObject, groups, simMedRowList, exprM,
column, colNamesVec, colDF, writeMarkerGenes){
result <- mapply(function(currentGroup, currentSimMed, mat,
allGroups, colLabel, simMedNames){
msg <- paste("Working on cluster", currentGroup)
message(msg)
## Create a dataframe clustering vs clustering
otherGroups <- allGroups[allGroups!=currentGroup]
tTestPval <- .buildTTestPval(otherGroups, tTestPval, colDF, mat,
colLabel, currentGroup)
## Apply the FDR
tTestFDR <- .buildTTestFDR(mat, allGroups, currentGroup,
tTestPval, currentSimMed, simMedNames)
## Write list if option = TRUE
if(writeMarkerGenes){
outputDir <- file.path(getOutputDirectory(theObject),
"marker_genes")
outputFile <- paste0(getExperimentName(theObject),
"_cluster_", currentGroup, "_genes.tsv")
write.table(tTestFDR, file.path(outputDir, outputFile),
col.names=TRUE, row.names=FALSE, quote=FALSE,
sep="\t")
}
return(tTestFDR)
}, groups, simMedRowList,
MoreArgs = list(exprM, groups, column, colNamesVec),
SIMPLIFY = FALSE)
return(result)
}
#' rankGenes
#'
#' @description
#' This function searches marker genes for each cluster.
#'
#' @usage
#' rankGenes(theObject, column="clusters", writeMarkerGenes=FALSE)
#'
#' @param theObject An Object of class scRNASeq for which the count matrix was
#' normalized (see ?normaliseCountMatrix), tSNE were calculated (see
#' ?generateTSNECoordinates), dbScan was run (see ?runDBSCAN), cells were
#' clustered (see ?clusterCellsInternal), as clusters themselves (see
#' ?calculateClustersSimilarity).
#' @param column Name of the column with a clustering result.
#' Default="clusters"
#' @param writeMarkerGenes If TRUE, output one list of marker genes per cluster
#' in the output directory defined in theObject and in the sub-directory
#' 'marker_genes'. Default=FALSE.
#'
#' @details
#' To understand the nature of the consensus clusters identified by CONCLUS,
#' it is essential to identify genes which could be classified as marker genes
#' for each cluster. To this aim, each gene should be "associated" to a
#' particular cluster. This association is performed by looking at upregulated
#' genes in a particular cluster compared to the others (multiple comparisons).
#'
#' The function rankGenes performs multiple comparisons of all genes from
#' theObject and rank them according to a score reflecting a FDR power.
#'
#' For each table corresponding to a particular consensus cluster, the first
#' column is a gene name. The following columns represent adjusted p-values
#' (FDR) of a one-tailed T-test between the considered cluster and all others.
#'
#' Top genes with significant FDR in most of the comparisons can be assumed as
#' positive markers of a cluster. The column mean_log10_fdr is the mean power of
#' FDR in all comparisons; the column n_05 is the number of comparisons in
#' which the gene was significantly upregulated. The score for marker genes is
#' the average power of FDR among all comparisons for a cluster multiplied to
#' weights taken from the clustersSimilarityMatrix + 0.05. Taking into account
#' both FDRs of all comparisons and clustersSimilarityMatrix allows us to keep
#' the balance between highlighting markers for individual clusters and their
#' 'families' which makes the final heatmap as informative as possible.
#'
#' Note: Adding 0.05 to the clustersSimilarityMatrix in calculating the score
#' helps avoiding the following problem: in case you have a cluster very
#' different from all others, it will have the value 1 on the diagonal and 0
#' similarities to all others groups in the clustersSimilarityMatrix. So all
#' weights for that cluster will be zeros meaning that the score would also be
#' zero and genes will be ordered in alphabetical order in the corresponding
#' marker genes list file.
#'
#' For a cluster k and a gene G, a scoreG was defined in the following way:
#'
#' scoreG= sum((-log10(fdrk, i + epsilon)*weightk,i) / nClusters-1)
#'
#' Where
#'
#' 1. fdrk,i is an adjusted p-value obtained by comparing expression of G in
#' cluster k versus expression of G in cluster i. \cr
#' 2. weightk,i is a similarity between these two groups taken from the element
#' in the clustersSimilarityMatrix. \cr
#' 3. nClusters is a number of consensus clusters given to the rankGenes(). \cr
#' 4. epsilon = 10-300 is a small number which does not influence the ranking
#' and added to avoid an error when fdr is equal to zero. \cr
#' 5. k = [1,…,nClusters]. \cr
#' 6. I = ([1,…,nClusters]exceptfor[k]). \cr
#'
#' @aliases rankGenes
#'
#' @author
#' Ilyess RACHEDI, based on code by Polina PAVLOVICH and Nicolas DESCOSTES.
#'
#' @rdname
#' rankGenes-scRNAseq
#'
#' @return
#' An object of class scRNASeq with its markerGenesList slot updated.
#'
#' @examples
#' ## Object scr containing the results of previous steps
#' load(system.file("extdata/scrFull.Rdat", package="conclus"))
#'
#' ## Ranking genes
#' scr <- rankGenes(scr)
#'
#' @seealso
#' retrieveTopClustersMarkers retrieveGenesInfo
#'
#' @exportMethod rankGenes
#' @importFrom SummarizedExperiment colData
#' @importFrom Biobase exprs
setMethod(
f = "rankGenes",
signature = "scRNAseq",
definition = function(theObject, column="clusters",
writeMarkerGenes=FALSE){
sceObject <- getSceNorm(theObject)
simMed <- getClustersSimilarityMatrix(theObject)
.checkParamsRankGenes(sceObject, simMed)
exprM <- Biobase::exprs(sceObject)
colDF <- SummarizedExperiment::colData(sceObject)
message("Ranking marker genes for each cluster.")
stopifnot(all(colnames(exprM) == rownames(colDF)))
groups <- unique(colDF[, column])
simMed <- simMed + 0.05
simMedRowList <- split(simMed, seq_len(nrow(simMed)))
colNamesVec <- as.numeric(colnames(simMed))
markerGenesList <- .buildMarkerGenesList(theObject, groups,
simMedRowList, exprM, column, colNamesVec, colDF,
writeMarkerGenes)
setMarkerGenesList(theObject) <- markerGenesList
rm(markerGenesList, simMed, groups)
return(theObject)
})
###################
## retrieveGenesInfo
###################
#' .saveGenesInfo
#'
#' @param theObject An Object of class scRNASeq for which retrieveGenesInfo was
#' run. See ?retrieveGenesInfo.
#'
#' @keywords internal
#' @noRd
#'
#' @importFrom utils write.table
.saveGenesInfo <- function(theObject){
outputDir <- file.path(getOutputDirectory(theObject),
"marker_genes/saveGenesInfo")
if(!file.exists(outputDir))
dir.create(outputDir, recursive=TRUE)
infos <- getGenesInfos(theObject)
infosList <- split(infos, infos$clusters)
invisible(lapply(infosList, function(clusterDF){
clustNB <- unique(clusterDF$clusters)
if(!isTRUE(all.equal(length(clustNB), 1)))
stop("Error in saveGenesInfo. Contact the",
" developper.")
outputFile <- file.path(outputDir,
paste0("genes_info_clust", clustNB,
".csv"))
write.csv(clusterDF, file=outputFile,
row.names=FALSE)
}))
}
.checkParamsretrieveGenesInfo <- function(theObject, orderGenes, genes, species,
saveInfos){
if(!isTRUE(all.equal(orderGenes,"initial")) &&
!isTRUE(all.equal(orderGenes,"alphabetical")))
stop("orderGenes should be 'initial' or 'alphabetical'.")
## Check if the normalized matrix is correct
sceObject <- getSceNorm(theObject)
if(all(dim(sceObject) == c(0,0)))
stop("The 'scRNAseq' object that you're using with 'retrieveGenesInfo'",
" function doesn't have its 'SceNorm' slot updated. Please use",
" 'normaliseCountMatrix' on the object before.")
## Check if the SCE object contain cluster colums in its colData
if(!("clusters" %in% names(colData(sceObject))))
stop("The 'scRNAseq' object that you're using with 'retrieveGenesInfo'",
" function doesn't have a correct 'SceNorm' slot. This slot",
" should be a 'SingleCellExperiment' object containing ",
"'clusters' column in its colData. Please check if you ",
"correctly used 'clusterCellsInternal' on the object.")
## Check the cluster similarity matrix
simMed <- getClustersSimilarityMatrix(theObject)
if(all(dim(simMed) == c(0,0)) || all(dim(simMed) == c(1,1)))
stop("The 'scRNAseq' object that you're using with 'retrieveGenesInfo'",
" function doesn't have a similarity matrix, Please use ",
"'calculateClustersSimilarity' on the object before.")
## Check the marker genes
if(isTRUE(all.equal(dim(genes), c(1,2))) &&
isTRUE(all.equal(genes$geneName, "gene1")) &&
is.na(genes$clusters))
stop("The 'scRNAseq' object that you're using with 'retrieveGenesInfo'",
" does not have marker genes. Please use ",
"'retrieveTopClustersMarkers' before.")
## Check saveInfos
if(!is.logical(saveInfos))
stop("saveInfos should be a boolean.")
}
.returnDB1 <- function(genes, ensembl){
attributes <- c("external_gene_name", # Complete name,
"uniprot_gn_symbol", # geneName
"description", # Description
"entrezgene_description",
"gene_biotype", # Feature.Type
"go_id" ) # Gene Ontology
values <- genes$geneName
filters <- "external_gene_name"
db1 <- .tryGetBM(attributes=attributes, ensembl=ensembl, values=values,
filters=filters)
return(db1)
}
.returnDB2 <- function(genes, ensembl){
attributes <- c("external_gene_name", # Complete name,
"uniprot_gn_symbol", # geneName
"chromosome_name", # Chromosome name
"ensembl_gene_id", # Ensembl
"entrezgene_id", #Entrez.Gene.ID
"uniprot_gn_id") # Uniprot.ID
values <- genes$geneName
filters <- "external_gene_name"
db2 <- .tryGetBM(attributes=attributes, ensembl=ensembl, values=values,
filters=filters)
return(db2)
}
#' .queryBiomart
#'
#' @description
#' Main sub-function that retrieves the uniprot gene symbol, the chromosome
#' name, the entrez gene description, the go id, the uniprot id, the external
#' gene name, the gene biotype, the ensembl id, and the entrez gene id from
#' biomaRt.
#'
#' @param genes Markers genes retrieved from the submitted object with
#' ?getTopMarkers.
#' @param ensembl An instance of biomaRt obtained with ?useEnsembl.
#'
#' @keywords internal
#' @noRd
#' @importFrom biomaRt getBM
#' @return Return the information described above..
.queryBiomart <- function(genes, ensembl){
database <- merge(x = .returnDB1(genes, ensembl),
y = .returnDB2(genes, ensembl),
by = c("external_gene_name", "uniprot_gn_symbol"),
all.x = TRUE)
## Group the GO id and the uniprot Id
options(dplyr.summarise.inform = FALSE)
database <- database %>% group_by(external_gene_name, uniprot_gn_symbol,
chromosome_name, entrezgene_description) %>%
summarise(go_id = gsub("^, ", "",
(x=paste(unique(go_id), collapse=', '))),
uniprot_gn_id = paste(unique(uniprot_gn_id),
collapse=', '),
description = paste(unique(description),
collapse=', '),
gene_biotype = paste(unique(gene_biotype),
collapse=', '),
ensembl_gene_id = paste(unique(ensembl_gene_id),
collapse=', '),
entrezgene_id = paste(unique(entrezgene_id),
collapse=', '),
)
return(database)
}
#' .groupGOandUniprotID
#'
#' @description
#' Retrieves the uniprot symbol, the external gene name, and the mgi information
#' from biomaRt.
#'
#' @param species Name of the species defined in the submitted object and
#' retrieved with ?getSpecies. Current values allowed are mouse and human.
#' Other organisms can be added on demand.
#' @param genes Markers genes retrieved from the submitted object with
#' ?getTopMarkers.
#' @param speDbID NULL if species is human and 'mgi_id' if species is mouse.
#' @param speDBdescription NULL if species is human and 'mgi_description' if
#' species is mouse.
#' @param ensembl An instance of biomaRt obtained with ?useEnsembl.
#' @keywords internal
#' @noRd
#' @importFrom biomaRt getBM
#' @return Return the uniprot gene symbol, the external gene name, and the
#' mgi information if the species is mouse.
.groupGOandUniprotID <- function(speDbID, speDBdescription, genes, ensembl){
if(!(is.null(speDbID) & is.null(speDBdescription)))
## Query biomart for specific db according to species
## external_gene_name is also searched because one gene can have
## several external_gene_name and so several speDbID and
## speDBdescription
spDB <- getBM(attributes=c("external_gene_name", "uniprot_gn_symbol",
speDbID,
speDBdescription),
values=genes$geneName,
filters = "external_gene_name",
mart=ensembl)
return(spDB)
}
#' .defineDatabase
#'
#' @description
#' Core functions that aims at building a database with all information about
#' the markers. It first selects the biomaRt instance to use according to the
#' defined species; It then queries biomaRt and formats the results.
#'
#' @param species Name of the species defined in the submitted object and
#' retrieved with ?getSpecies. Current values allowed are mouse and human.
#' Other organisms can be added on demand.
#' @param genes Markers genes retrieved from the submitted object with
#' ?getTopMarkers.
#' @param speDbID NULL if species is human and 'mgi_id' if species is mouse.
#' @param speDBdescription NULL if species is human and 'mgi_description' if
#' species is mouse.
#'
#' @keywords internal
#' @noRd
#' @importFrom biomaRt useEnsembl
#' @return A database with biomaRt information.
.defineDatabase <- function(species, genes, speDbID, speDBdescription){
## Selecting the BioMart database to use
databaseDict <- c(mouse = "mmusculus_gene_ensembl",
human = "hsapiens_gene_ensembl")
dataset <- databaseDict[species]
ensembl <- .tryUseMart(biomart="ensembl", dataset)
## Query biomart
database <- .queryBiomart(genes, ensembl)
## Group the GO id and the uniprot Id
spDB <- .groupGOandUniprotID(speDbID, speDBdescription, genes, ensembl)
## Merge the second column, the first already existing in the first table
if(!is.null(spDB))
database <- merge(x = database, all.x = TRUE, y = spDB, by =
c("external_gene_name", "uniprot_gn_symbol"))
## Add cluster column
database <- merge(x = database, y = genes,
by.x = "external_gene_name", by.y = "geneName", all.x = TRUE)
## Add Symbol column
database <- cbind(database, Symbol = database$external_gene_name)
return(database)
}
#' .orderCol
#'
#' @description
#' Re-organize the columns of the table (database) containing the biomaRt
#' retrieved information.
#'
#' @param speDBdescription NULL if species is human and 'mgi_description' if
#' species is mouse.
#' @param speDbID NULL if species is human and 'mgi_id' if species is mouse.
#' @param database Table containing the biomaRt retrieved information.
#' @param orderGenes If "initial" then the order of genes will not be changed.
#' The other option is "alphabetical". Default = "initial".
#' @param genes Markers genes retrieved from the submitted object with
#' ?getTopMarkers.
#'
#' @keywords internal
#' @noRd
#'
#' @return The re-ordered database if orderGenes is set to 'initial'.
.orderCol <- function(speDBdescription, speDbID, database, orderGenes, genes){
colnamesOrder <- c("clusters", "Symbol", "gene_biotype",
"entrezgene_description", speDBdescription,
"chromosome_name",
"ensembl_gene_id", speDbID, "entrezgene_id",
"uniprot_gn_id", "uniprot_gn_symbol", "go_id")
result <- database[, colnamesOrder]
if(isTRUE(all.equal(orderGenes, "initial"))){
desiredOrder <- genes$geneName
result <- merge(list(Symbol = unique(desiredOrder)),
result, sort = FALSE)
rownames(result) <- seq_len(nrow(result))
}
return(result)
}
#' .displayInfoMarkers
#'
#' @description
#' This function displays the info got by retrieveGenesInfo
#' @param InfoMarkers Information about markers get by retrieveGenesInfo
#' @keywords internal
#' @importFrom DT datatable
#' @noRd
.displayInfoMarkers <- function(InfoMarkers){
print(DT::datatable(InfoMarkers, class = 'nowrap hover stripe',
options = list(searching = FALSE, bLengthChange = FALSE, scrollX=TRUE,
columnDefs = list(list(className = 'dt-center',
targets=c(seq(3), seq(8,9))),
list(className="dt-left", targets=seq(4,11))))))
}
#' retrieveGenesInfo
#'
#' @description
#' This method retrieve information about the marker genes of each cluster
#' querying the Ensembl database with biomaRt and display the result.
#'
#' @usage
#' retrieveGenesInfo(theObject, groupBy="clusters",
#' orderGenes="initial", getUniprot=TRUE, cores=2,
#' saveInfos=FALSE)
#'
#' @param theObject An Object of class scRNASeq for which the count matrix was
#' normalized (see ?normaliseCountMatrix), tSNE were calculated (see
#' ?generateTSNECoordinates), dbScan was run (see ?runDBSCAN), cells were
#' clustered (see ?clusterCellsInternal), as clusters themselves (see
#' ?calculateClustersSimilarity), and ?rankGenes as ?retrieveTopMarkers.
#' @param groupBy A column in the input table used for grouping the genes in
#' the output tables. This option is useful if a table contains genes from
#' different clusters. Default = "clusters".
#' @param orderGenes If "initial" then the order of genes will not be changed.
#' The other option is "alphabetical". Default = "initial".
#' @param getUniprot Boolean, whether to get information from UniProt or not.
#' Default = TRUE.
#' @param cores Maximum number of jobs that CONCLUS can run in parallel.
#' Default is 1.
#' @param saveInfos If TRUE, save the genes infos table in the directory
#' defined in theObject (?getOutputDirectory), in the sub-directory
#' 'marker_genes/saveGenesInfo'.
#'
#' @details
#' The output dataframe is composed of the following columns:
#'
#' - uniprot_gn_symbol: Uniprot gene symbol. \cr
#' - clusters: The cluster to which the gene is associated. \cr
#' - external_gene_name: The complete gene name. \cr
#' - go_id: Gene Ontology (GO) identification number. \cr
#' - mgi_description: If the species is mouse, description of the gene
#' on MGI. \cr
#' - entrezgene_description: Description of the gene by Entrez database. \cr
#' - gene_biotype: protein coding gene, lincRNA gene, miRNA gene, unclassified
#' non-coding RNA gene, or pseudogene. \cr
#' - chromosome_name: The chromosome on which the gene is located. \cr
#' - Symbol: Official gene symbol. \cr
#' - ensembl_gene_id: ID of the gene on the ensembl database. \cr
#' - mgi_id: If the species is mouse, ID of the gene on the MGI database. \cr
#' - entrezgene_id: ID of the gene on the entrez database. \cr
#' - uniprot_gn_id: ID of the gene on the uniprot database. \cr
#'
#' @aliases retrieveGenesInfo
#'
#' @rdname
#' retrieveGenesInfo-scRNAseq
#'
#' @return
#' Display a table with the info retrieved.
#'
#' @examples
#' ## Object scr containing the results of previous steps
#' load(system.file("extdata/scrFull.Rdat", package="conclus"))
#'
#' ## Getting genes info
#' scr <- retrieveGenesInfo(scr, cores=2)
#'
#' @seealso
#' rankGenes retrieveTopClustersMarkers
#'
#' @exportMethod retrieveGenesInfo
#' @importFrom dplyr group_by %>% summarise
#'
#' @author
#' Ilyess RACHEDI, based on code by Polina PAVLOVICH and Nicolas DESCOSTES.
setMethod(
f = "retrieveGenesInfo",
signature = "scRNAseq",
definition = function(theObject, groupBy="clusters",
orderGenes="initial", getUniprot=TRUE, cores=2,
saveInfos=FALSE){
species <- getSpecies(theObject)
genes <- getTopMarkers(theObject)
.checkParamsretrieveGenesInfo(theObject, orderGenes, genes, species,
saveInfos)
## Additional Special database and columns according to species
if(isTRUE(all.equal(species, "mouse"))){
speDbID <- "mgi_id"
speDBdescription <- "mgi_description"
}else
speDbID <- speDBdescription <- spDB <- NULL
database <- .defineDatabase(species, genes, speDbID,
speDBdescription)
## Order the data frames colums to be abe to bind them
result <- .orderCol(speDBdescription, speDbID, database, orderGenes,
genes)
rm(database)
setGenesInfos(theObject) <- result
if(saveInfos)
.saveGenesInfo(theObject)
.displayInfoMarkers(result)
return(theObject)
})
###################
## retrieveTopClustersMarkers
###################
.checkParamsTopClust <- function(sceObject, markerGenesList,
numberOfClusters){
## Check if the normalized matrix is correct
if(all(dim(sceObject) == c(0,0)))
stop("The 'scRNAseq' object that you're using with ",
"'retrieveTopClustersMarkers' function doesn't have its ",
"'sceNorm' slot updated. Please use 'normaliseCountMatrix' on",
" the object before.")
## Check if the SCE object contain cluster colums in its colDF
if(!("clusters" %in% names(colData(sceObject))))
stop("The 'scRNAseq' object that you're using with ",
"'retrieveTopClustersMarkers' function doesn't have a correct",
" 'sceNorm' slot. This slot should be a 'SingleCellExperiment'",
" object containing 'clusters' column in its colData. Please ",
"check if you correctly used 'clusterCellsInternal' on the ",
"object.")
## Check if the markerGenesList is correct
if(!isTRUE(all.equal(length(markerGenesList),numberOfClusters)))
stop("Something wrong with number of clusters. It is supposed",
" to be equal to : ", numberOfClusters, ". Current ",
"number: ", length(markerGenesList), ". Did you use ",
"'calculateClustersSimilarity' and 'rankGenes'?")
}
#' .writeMarkersList
#'
#' @description
#' Writes one list per cluster in the output folder defined in theObject, and
#' in the sub-directory marker_genes/markers_lists.
#'
#' @param theObject An Object of class scRNASeq for which rankGenes was
#' run. See ?rankGenes.
#' @param topMarkers Data frame containing two columns geneName and
#' clusters.
#' @param nTop Number of marker genes to retrieve per cluster.
#'
#' @keywords internal
#' @noRd
.writeMarkersList <- function(theObject, topMarkers, nTop){
## Creating the output folder
dataDirectory <- getOutputDirectory(theObject)
outputDir <- file.path(dataDirectory, paste0("marker_genes/markers_lists"))
if(!file.exists(outputDir))
dir.create(outputDir, showWarnings=FALSE)
message("Writing lists of markers to ", outputDir)
## Creating a list of markers per clusters
markersClustersList <- split(topMarkers, topMarkers$clusters)
## Writing each element of the lists to a corresponding file
experimentName <- getExperimentName(theObject)
invisible(mapply(function(currentMarkers, currentClustNb, thres, expN){
outputName <- paste0(expN,"_cluster_", currentClustNb)
fileName <- paste0(outputName, "_markers.csv")
outputFile <- file.path(outputDir, fileName)
write.table(currentMarkers, file=outputFile, sep="\t",
quote=FALSE, row.names=FALSE, col.names=TRUE)
}, markersClustersList, names(markersClustersList),
MoreArgs=list(nTop, experimentName)))
}
#' retrieveTopClustersMarkers
#'
#' @description
#' This function retrieves the top N marker genes for each cluster.
#'
#' @usage
#' retrieveTopClustersMarkers(theObject, nTop=10, removeDuplicates = TRUE,
#' writeMarkerGenes = FALSE)
#'
#' @param theObject An Object of class scRNASeq for which rankGenes was
#' run. See ?rankGenes.
#' @param nTop Number of marker genes to retrieve per cluster. Default=10.
#' @param removeDuplicates If TRUE, duplicated markers are removed from the
#' lists. Default=TRUE.
#' @param writeMarkerGenes If TRUE, writes one list per cluster in the output
#' folder defined in theObject, and in the sub-directory
#' marker_genes/markers_lists. Default=FALSE.
#'
#' @aliases retrieveTopClustersMarkers
#' @rdname
#' retrieveTopClustersMarkers-scRNAseq
#'
#' @return
#' Output the list of markers to marker_genes/markers_lists if writeMarkersGenes
#' is TRUE and return a scRNASeq object with its clustersMarkers slot updated.
#'
#' @examples
#' ## Object scr containing the results of previous steps
#' load(system.file("extdata/scrFull.Rdat", package="conclus"))
#'
#' ## Retrieve the top 10 markers per cluster
#' scr <- retrieveTopClustersMarkers(scr)
#'
#' @seealso
#' retrieveGenesInfo
#'
#' @exportMethod retrieveTopClustersMarkers
#' @author
#' Ilyess RACHEDI, based on code by Polina PAVLOVICH and Nicolas DESCOSTES.
setMethod(
f = "retrieveTopClustersMarkers",
signature = "scRNAseq",
definition = function(theObject, nTop=10, removeDuplicates=TRUE,
writeMarkerGenes = FALSE){
markerGenesList <- getMarkerGenesList(theObject)
sceObject <- getSceNorm(theObject)
clustVec <- SummarizedExperiment::colData(sceObject)$clusters
clusterIndexes <- unique(clustVec)
numberOfClusters <- length(clusterIndexes)
.checkParamsTopClust(sceObject, markerGenesList, numberOfClusters)
##Retrieve the nTop genes in each element of markerGenesList
geneName <- unlist(lapply(markerGenesList,
function(currentMarkers)
currentMarkers$Gene[seq_len(nTop)]))
clusters <- rep(clusterIndexes, each=nTop)
topMarkers <- data.frame(geneName, clusters)
## Checking if duplicates should be removed. Set to FALSE if the
## number of clusters after duplicate removal is not the same than
## the number of clusters found.
if(removeDuplicates){
test <- topMarkers[!duplicated(topMarkers$geneName), ]
clustSimOrdered <- getClustersSimilarityOrdered(theObject)
nbClustMark <- length(unique(test$clusters))
nbClust <- length(clustSimOrdered)
if(nrow(test) > 1 &&
!isTRUE(all.equal(nbClust, nbClustMark)))
message("Duplicates are not removed to conserve the number",
" of clusters.")
else
topMarkers <- test
}
if(writeMarkerGenes)
.writeMarkersList(theObject, topMarkers, nTop)
setTopMarkers(theObject) <- topMarkers
return(theObject)
})
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.