#' @title bioLOGIC
#'
#' @description This function allows you to express your love for the superior furry animal.
#' @param agree Do you agree dogs are the best pet? Defaults to TRUE.
#' @keywords dogs
#' @export
#' @import DESeq2
#' @import methods
#library("methods")
#library(DESeq2)
setClass(
"bioLOGIC",
representation(
clusterSigEnrichmentList = "list",
documentationParams = "list",
sampleDetailList = "list",
dbDetailList = "list",
projectDetailList = "list",
scDetailList = "list",
referenceTableList = "list",
parameterList = "list",
dfGeneAnnotation = "data.frame",
dfPCA = "data.frame",
dfPCAgenes = "data.frame",
PCApercentVar = "numeric",
dfTPM = "data.frame",
dfFPKM = "data.frame",
RSEMcountMatrix = "matrix",
dfDesign = "data.frame",
dfModel = "data.frame",
dfSummary = "data.frame",
databaseTable = "data.frame",
reportVec = "character",
scriptVec = "character",
documentationVector = "character",
GSEAtableList = "list",
categoryViewTableList = "list",
plotCollection = "list",
ObjDds = "DESeqDataSet",
DESeqNormReadCountsTable = "data.frame",
DEseq2contrastTable = "data.frame",
DEseq2LRTtable = "data.frame",
enrichmentList = "list",
dataTableList = "list"
)
)
setClass(
"bioLOGIC_proteomics",
contains = "bioLOGIC",
slots = c(
extra = "character"
)
)
setClass(
"bioLOGIC_singleCell",
contains = "bioLOGIC"
)
setClass(
"bioLOGIC_ATACseq",
contains = "bioLOGIC"
)
setClass(
"bioLOGIC_bulkRNAseq",
contains = "bioLOGIC"
)
###############################################################################
## Add to script vector ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
setGeneric(
name="add2vec",
def=function(obj, slot_name, value) {
`slot<-`(obj,slot_name,value=c(slot(obj,slot_name),value))
}
)
###############################################################################
## Add to script vector ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
setGeneric(
name="setMountingPoint",
def=function(obj) {
if (dir.exists("/Volumes/babs/working/boeings/")){
hpcMount <- "/Volumes/babs/working/boeings/"
} else if (dir.exists("Y:/working/boeings/")){
hpcMount <- "Y:/working/boeings/"
} else if (dir.exists("/camp/stp/babs/working/boeings/")){
hpcMount <- "/camp/stp/babs/working/boeings/"
} else {
hpcMount <- ""
}
obj@parameterList$hpcMount <- hpcMount
return(obj)
}
)
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="setAnalysisPaths",
def=function(obj) {
obj@parameterList$AlignSampleDir <- paste0(obj@parameterList$workdir, "RSEM/Ensembl")
obj@parameterList$AlignOutputEnsDir <- paste0(obj@parameterList$workdir, "RSEM/Ensembl/")
obj@parameterList$FASTQCdir <- paste0(obj@parameterList$workdir, "FASTQC/")
obj@parameterList$logDir <- paste0(obj@parameterList$workdir, "logs/")
obj@parameterList$DEseq2Dir <- paste0(obj@parameterList$localWorkDir, "DESeq2/")
return(obj)
}
)
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="setDataBaseParameters",
def=function(obj){
## Primary data table
#obj@dbDetailList$primDataDB <- paste0(
# substr(obj@parameterList$project_id, 1,3),
# "_data"
#)
obj@parameterList$rnaseqdbTableName <- paste0(
obj@parameterList$project_id,
"_",
obj@parameterList$experiment.type,
"_table"
)
##Lab table ##
#obj@parameterList$lab.categories.table <- paste0(
# substr(obj@parameterList$project_id,1,2),
# "_lab_categories"
#)
##PCA table ##
obj@parameterList$PCAdbTableName <- paste0(
obj@parameterList$project_id,
"_PCA"
)
## Cat reference tables ##
obj@parameterList$cat.ref.db.table = paste0(
obj@parameterList$project_id,
"_cat_reference_db_table"
)
obj@parameterList$cat.ref.db <- obj@dbDetailList$primDataDB
obj@parameterList$enriched.categories.dbTableName <- paste0(
obj@parameterList$project_id,
"_enriched_categories_table"
)
return(obj)
}
)
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="setCrickGenomeAndGeneNameTable",
def=function(obj, genomeDir="/camp/svc/reference/Genomics/babs"){
releaseID <- gsub("release-", "", obj@parameterList$release)
## Set referemce file, gtf file ##
if (obj@parameterList$species == "mus_musculus"){
obj@parameterList$genome <- "GRCm38"
obj@parameterList$primaryAlignmentGeneID <- "ENSMUSG"
if (obj@parameterList$release == "release-89"){
obj@parameterList$path2GeneIDtable <- paste0(
obj@parameterList$hpcMount,
"Projects/reference_data/gene_id_annotation_files/",
"20171206.release-89.mm.ENSMUSG.mgi.entrez.uniprot.description.hgnc.table.txt"
)
} else {
stop("No valid gene reference file specified")
}
obj@parameterList$GTFfile <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/gtf/Mus_musculus.GRCm38.",
releaseID,
".rnaseqc.gtf"
)
obj@parameterList$genomeFa <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/genome/Mus_musculus.GRCm38.dna_sm.toplevel.fa"
)
obj@parameterList$genomeFai<- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/genome/Mus_musculus.GRCm38.dna_sm.toplevel.fai"
)
obj@parameterList$rRNAfile <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/gtf/Mus_musculus.GRCm38.",releaseID,".rRNA.list"
)
obj@parameterList$geneIDcolumn <- "mgi_symbol"
obj@parameterList$bedFile <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/gtf/Mus_musculus.GRCm38.",releaseID,".bed"
)
obj@parameterList$refFlatFile <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/gtf/Mus_musculus.GRCm38.",releaseID,".rRNA.refflat"
)
obj@parameterList$ribosomalIntervalList <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/gtf/Mus_musculus.GRCm38.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"/gtf/Mus_musculus.GRCm38.",releaseID,".rRNA.interval_list"
)
obj@parameterList$bowtieGenomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/mus_musculus/ensembl/GRCm38/",
obj@parameterList$release,
"genome_idx/bowtie2/Mus_musculus.GRCm38.dna_sm.toplevel"
)
} else if (obj@parameterList$species == "homo_sapiens"){
obj@parameterList$genome <- "GRCh38"
obj@parameterList$primaryAlignmentGeneID <- "ENSG"
obj@parameterList$path2GeneIDtable <- paste0(
obj@parameterList$hpcMount,
"Projects/reference_data/gene_id_annotation_files/",
"20171206.release-89.hs.ENSG.mgi.entrez.uniprot.description.mgi.table.txt"
)
obj@parameterList$GTFfile <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rnaseqc.gtf"
)
obj@parameterList$genomeFa <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/genome/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
)
obj@parameterList$genomeFai<- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/genome/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.fai"
)
obj@parameterList$rRNAfile <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rRNA.list"
)
obj@parameterList$geneIDcolumn <- "hgnc_symbol"
obj@parameterList$bedFile = paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".bed"
)
obj@parameterList$refFlatFile = paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rRNA.refflat"
)
obj@parameterList$ribosomalIntervalList <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/genome_idx/bowtie2/Homo_sapiens.GRCh38.dna_sm.primary_assembly"
)
} else if (obj@parameterList$species == "danio_rerio"){
obj@parameterList$genome <- "GRCz10"
obj@parameterList$primaryAlignmentGeneID <- "ENSDARG"
obj@parameterList$path2GeneIDtable <- paste0(
obj@parameterList$hpcMount,
"Projects/reference_data/gene_id_annotation_files/",
"20180724.release-89.dr.ENSDARG.hgnc.description.table.txt"
)
obj@parameterList$GTFfile <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/gtf/Danio_rerio.GRCz10.",releaseID,".rnaseqc.gtf"
)
obj@parameterList$genomeFa <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/genome/Danio_rerio.GRCz10.dna_sm.toplevel.fa"
)
obj@parameterList$genomeFai<- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/genome/Danio_rerio.GRCz10.dna_sm.toplevel.fa.fai"
)
obj@parameterList$rRNAfile <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/gtf/Danio_rerio.GRCz10.",releaseID,".rRNA.list"
)
obj@parameterList$geneIDcolumn <- "dr_symbol"
obj@parameterList$bedFile <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/gtf/Danio_rerio.GRCz10.",releaseID,".bed"
)
obj@parameterList$refFlatFile <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/gtf/Danio_rerio.GRCz10.",releaseID,".rRNA.refflat"
)
obj@parameterList$ribosomalIntervalList <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/gtf/Danio_rerio.GRCz10.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"/gtf/Danio_rerio.GRCz10.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/danio_rerio/ensembl/GRCz10/",
obj@parameterList$release,
"genome_idx/bowtie2/Danio_rerio.GRCz10.dna_sm.toplevel."
)
} else if (obj@parameterList$species == "gallus_gallus"){
obj@parameterList$genome <- "GRCg6a"
obj@parameterList$primaryAlignmentGeneID <- "ENSGALG"
obj@parameterList$path2GeneIDtable <- paste0(
obj@parameterList$hpcMount,
"Projects/reference_data/gene_id_annotation_files/",
"2021.ENSGALG.biomart.plus.hgnc.txt"
)
obj@parameterList$GTFfile <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/gtf/Gallus_gallus.GRCg6a.",releaseID,".rnaseqc.gtf"
)
obj@parameterList$genomeFa <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/genome/Gallus_gallus.GRCg6a.dna_sm.toplevel.fa"
)
obj@parameterList$genomeFai<- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/genome/Gallus_gallus.GRCg6a.dna_sm.toplevel.fa.fai"
)
obj@parameterList$rRNAfile <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/gtf/Gallus_gallus.GRCg6a.",releaseID,".rRNA.list"
)
obj@parameterList$geneIDcolumn <- "gg_symbol"
obj@parameterList$bedFile <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/gtf/Gallus_gallus.GRCg6a.",releaseID,".bed"
)
obj@parameterList$refFlatFile <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/gtf/Gallus_gallus.GRCg6a.",releaseID,".rRNA.refflat"
)
obj@parameterList$ribosomalIntervalList <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/gtf/Gallus_gallus.GRCg6a.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"/gtf/Gallus_gallus.GRCg6a.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/gallus_gallus/ensembl/GRCg6a/",
obj@parameterList$release,
"genome_idx/bowtie2/Gallus_gallus.GRCg6a.dna_sm.toplevel."
)
} else if (obj@parameterList$species == "homo_sapiens"){
obj@parameterList$genome <- "GRCh38"
obj@parameterList$primaryAlignmentGeneID <- "ENSG"
obj@parameterList$path2GeneIDtable <- paste0(
obj@parameterList$hpcMount,
"Projects/reference_data/gene_id_annotation_files/",
"20171206.release-89.hs.ENSG.mgi.entrez.uniprot.description.mgi.table.txt"
)
obj@parameterList$GTFfile <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rnaseqc.gtf"
)
obj@parameterList$genomeFa <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/genome/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
)
obj@parameterList$genomeFai<- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/genome/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa.fai"
)
obj@parameterList$rRNAfile <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rRNA.list"
)
obj@parameterList$geneIDcolumn <- "hgnc_symbol"
obj@parameterList$bedFile = paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".bed"
)
obj@parameterList$refFlatFile = paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rRNA.refflat"
)
obj@parameterList$ribosomalIntervalList <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/gtf/Homo_sapiens.GRCh38.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/homo_sapiens/ensembl/GRCh38/",
obj@parameterList$release,
"/genome_idx/bowtie2/Homo_sapiens.GRCh38.dna_sm.primary_assembly"
)
} else if (obj@parameterList$species == "drosophila_melanogaster"){
obj@parameterList$genome <- "BDGP6"
obj@parameterList$primaryAlignmentGeneID <- "ENSVDME"
## To be changed
obj@parameterList$path2GeneIDtable <- paste0(
obj@parameterList$hpcMount,
"Projects/reference_data/gene_id_annotation_files/",
"20190611.release-89.mm.ENSDMELG.hgnc.uniprot.description.table.txt"
)
obj@parameterList$GTFfile <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/gtf/Drosophila_melanogaster.BDGP6.",releaseID,".rnaseqc.gtf"
)
obj@parameterList$genomeFa <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/genome/Drosophila_melanogaster.BDGP6.dna_sm.toplevel.fa"
)
obj@parameterList$genomeFai<- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/genome/Drosophila_melanogaster.BDGP6.dna_sm.toplevel.fa.fai"
)
obj@parameterList$rRNAfile <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/gtf/Drosophila_melanogaster.BDGP6.",releaseID,".rRNA.list"
)
obj@parameterList$geneIDcolumn <- "Dmel_symbol"
obj@parameterList$bedFile <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/gtf/Drosophila_melanogaster.BDGP6.",releaseID,".bed"
)
obj@parameterList$refFlatFile <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/gtf/Drosophila_melanogaster.BDGP6.",releaseID,".rRNA.refflat"
)
obj@parameterList$ribosomalIntervalList <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/gtf/Drosophila_melanogaster.BDGP6.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"/gtf/Drosophila_melanogaster.BDGP6.",releaseID,".rRNA.interval_list"
)
obj@parameterList$genomeidx <- paste0(
"/camp/svc/reference/Genomics/babs/drosophila_melanogaster/ensembl/BDGP6/",
obj@parameterList$release,
"genome_idx/bowtie2/Drosophila_melanogaster.BDGP6.dna_sm.toplevel."
)
} else {
stop(" No valid species specified in the parameterList.")
}
## Set genome index ##
obj@parameterList$genomeIndex <- paste0(
genomeDir,
"/",
obj@parameterList$species
,"/ensembl/",
obj@parameterList$genome,
"/",
obj@parameterList$release,
"/genome_idx/rsem/star/",
obj@parameterList$read.length,
"/genome"
)
return(obj)
}
)
###############################################################################
## Create required folders ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="createAnalysisFolders",
def=function(
obj
){
obj@parameterList$datadir <- paste0(
obj@parameterList$folder,
"basedata/"
)
## Create basedata folder ##
obj@parameterList$localDataDir <- paste0(
obj@parameterList$folder,
"basedata/"
)
if (!dir.exists(obj@parameterList$localDataDir)){
dir.create(obj@parameterList$localDataDir)
}
## Create workdir ##
obj@parameterList$workdir <- paste0(
obj@parameterList$folder,
"workdir/"
)
obj@parameterList$localWorkDir <- paste0(
obj@parameterList$folder,
"workdir/"
)
if (!dir.exists(obj@parameterList$localWorkDir)){
dir.create(obj@parameterList$localWorkDir)
}
## Create fastq dir ##
obj@parameterList$fastqDir <- paste0(
obj@parameterList$folder,
"FASTQ_files/"
)
obj@parameterList$localFastqDir <- paste0(
obj@parameterList$folder,
"FASTQ_files/"
)
if (!dir.exists(obj@parameterList$localFastqDir)){
dir.create(obj@parameterList$localFastqDir)
}
## Create outputdir ##
obj@parameterList$outputDir <- paste0(
obj@parameterList$folder,
"outputs/"
)
if (!dir.exists(obj@parameterList$outputDir)){
dir.create(obj@parameterList$outputDir)
}
## Create design file name ##
obj@parameterList$designFN <- paste0(
obj@parameterList$project_id,
".design.file.txt"
)
return(obj)
}
)
## ##
###############################################################################
###############################################################################
## Add annotation ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="addGeneAnnotation",
def=function(
obj,
addUniprotColumn = FALSE
){
dfAnno <- read.delim(
obj@parameterList$path2GeneIDtable,
header = TRUE,
sep = "\t",
stringsAsFactors = FALSE
)
dfAnno$for_GSEA_gene_chip <- NULL
dfAnno$Gene.name <- NULL
dfAnno$Gene.type <- NULL
dfAnno$Gene_description <- NULL
obj@dfGeneAnnotation <- dfAnno
return(obj)
}
)
## Done adding annotation ##
###############################################################################
###############################################################################
## Add FPKM/TPM ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="addTPMorFPKMtable",
def=function(
obj = "biologic objec",
addTPM = TRUE,
addFPKM = FALSE
){
count.data.file <- paste0(
obj@parameterList$localWorkDir,
"RSEM/",
obj@parameterList$RSEMcountDataFile
)
count.data.file <- paste0(
obj@parameterList$localWorkDir,
"RSEM/",
obj@parameterList$RSEMcountDataFile
)
###############################################################################
# Prepare TPM and FPKM tables #
###############################################################################
## Make sure dfDesign is ordered properly ##
samples <- as.vector(
unique(
obj@dfDesign$sample.id
)
)
files <- paste(
obj@parameterList$localWorkDir,
"RSEM/Ensembl/",
samples,
".genes.results",
sep=""
)
#library(SBwebtools)
list.tpm.fpkm <- create.tpm.and.fpkm.tables(
workdir = obj@parameterList$localWorkDir,
samples = samples,
files = files
)
if (addFPKM){
obj@dfFPKM <- list.tpm.fpkm$df.fpkm
names(obj@dfFPKM) <- gsub("gene_id", obj@parameterList$primaryAlignmentGeneID, names(obj@dfFPKM))
names(obj@dfFPKM) <- gsub(".fpkm", "", names(obj@dfFPKM))
rSums <- rowSums(obj@dfFPKM[,2:ncol(obj@dfFPKM)])
obj@dfFPKM <- obj@dfFPKM[rSums > 0, ]
} else if (addTPM) {
obj@dfTPM <- list.tpm.fpkm$df.tpm
names(obj@dfTPM) <- gsub("gene_id", obj@parameterList$primaryAlignmentGeneID, names(obj@dfTPM))
names(obj@dfTPM) <- gsub(".tpm", "", names(obj@dfTPM))
rSums <- rowSums(obj@dfTPM[,2:ncol(obj@dfTPM)])
obj@dfTPM <- obj@dfTPM[rSums > 0, ]
} else {
print("Nothing added.")
}
rm(list.tpm.fpkm)
return(obj)
}
)
## Add FPKM/TPM ##
###############################################################################
###############################################################################
## Add FPKM/TPM ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="prepareRSEMcountMatrix",
def=function(
obj = "biologic objec",
string.to.be.deleted.in.raw.counts.columns = paste0("X", gsub("/", ".",paste0(obj@parameterList$workdir, "RSEM/Ensembl/")))
){
obj@RSEMcountMatrix <- readAndPrepareCountMatrix(
count.data.fn = paste0(
obj@parameterList$localWorkDir,
"RSEM/",
obj@parameterList$RSEMcountDataFile
),
string.to.be.deleted.in.raw.counts.columns = string.to.be.deleted.in.raw.counts.columns,
df.design = obj@dfDesign
)
return(obj)
}
)
## Done preparing RSEM matrix ##
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#'
#'
setGeneric(
name="createRNAseqQCscript",
def=function(
obj = "biologic objec",
scriptVecSlot = "scriptVec",
bamSuffix = "STAR.genome.bam"
){
obj@parameterList$RnaSQCbaseDataDir <- paste0(
obj@parameterList$workdir, "RSEM/Ensembl"
)
obj@parameterList$RnaSQCBamSuffix <- bamSuffix
tempShellScriptVector <- as.vector(NULL, mode = "character")
if (obj@parameterList$stranded){
strandedness<- "SECOND_READ_TRANSCRIPTION_STRAND"
} else {
strandedness<- "NONE"
}
## Order samples so that sortin is > condition > sample.group > sample.id
#df.design <- df.design[order(df.design$dataseries, df.design$sample.group, df.design$sample.id),]
samples = as.vector(
unique(
obj@dfDesign[,"sample.id"]
)
)
tempShellScriptVector <- c(
tempShellScriptVector,
'#!/bin/sh',
'\n',
'#Copy this shell script into the project directory and run it from there.',
'\n',
'#################################################################################',
'\n',
'##Create log directory ##########################################################',
'\n',
'if [ ! -d logs ]; then',
'\n',
' mkdir logs',
'\n',
'fi',
'\n',
'\n',
'#################################################################################',
'\n',
'###VARIABLES#####################################################################',
'\n',
'#################################################################################',
'\n',
paste0('project="', obj@parameterList$project_id, '"'),
'\n',
'projectID=""',
'\n',
'#Path to the directory with the BAM files', '\n',
'#FASTQ files have to be named [sample_name_as_given_in_samples_below]_R1.fastq.gz or [sample_name_as_given_in_samples_below]_R2.fastq.gz', '\n',
paste0('alignDir="', obj@parameterList$RnaSQCbaseDataDir, '"'), '\n',
paste0('GTFfile="', obj@parameterList$GTFfile, '"'), '\n',
#paste0('GTFfile_RNASeQC="', GTFfile.RNASeQC, '"'), '\n',
paste0('rRNAfile="', obj@parameterList$rRNAfile, '"'),
'\n',
'\n',
paste0('samplesuffix="',bamSuffix,'"'),
'\n',
'\n',
paste0('genome_fa="',obj@parameterList$genomeFa,'"'), '\n'
)
for (i in 1:length(samples)){
if (i ==1){
tempShellScriptVector <- c(
tempShellScriptVector,
paste0('samples="', samples[i])
)
} else {
tempShellScriptVector <- c(
tempShellScriptVector,
'\n', samples[i]
)
}
}
tempShellScriptVector <- c(
tempShellScriptVector,
'"', '\n',
'\n',
'\n',
'#################################################################################', '\n',
'##FUNCTIONS######################################################################', '\n',
'#################################################################################', '\n',
'\n',
'wait_on_lsf() { ## wait on jobs{', '\n',
'sleep 300', '\n',
'n=`squeue --name=$project | wc -l`', '\n',
#'n=n-1',
'\n',
'while [ $n -ne 1 ]', '\n',
'do', '\n',
'n=`squeue --name=$project | wc -l`', '\n',
#'n=n-1',
'\n',
'((z=$n-1))', '\n',
'#number of running', '\n',
'echo "$project jobs ($projectID) running: $z"',
'\n',
'#number of pending', '\n',
'sleep 300', '\n',
'done', '\n',
'}', '\n',
'\n',
'\n',
'\n',
'## End of function ##', '\n',
'#################################################################################', '\n',
'\n', '\n',
'#################################################################################', '\n',
'# Prere bam files for RNASeQC #', '\n',
'#################################################################################', '\n',
'#################################################################################', '\n',
'# AddOrReplaceReadGroups #', '\n',
'#################################################################################', '\n','\n',
'projectID="AddOrReplaceReadGroups"', '\n',
'module load R/3.3.1-foss-2016b-bioc-3.3-libX11-1.6.3', '\n','\n',
'echo "module load R/3.3.1-foss-2016b-bioc-3.3-libX11-1.6.3" >> commands.txt', '\n',
'module load picard/2.1.1-Java-1.8.0_112', '\n','\n',
'echo "module load picard/2.1.1-Java-1.8.0_112" >> commands.txt', '\n',
'echo "redo headers on Tophat bam output"', '\n',
'echo "#Submitted jobs" >> commands.txt ', '\n',
'#projectID="headers"', '\n',
'for sample in $samples', '\n',
' do ', '\n',
' echo "java -jar ${EBROOTPICARD}/picard.jar AddOrReplaceReadGroups \\', '\n',
' I=${alignDir}/${sample}.${samplesuffix} \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' RGID=$sample \\', '\n',
' RGCN=CCCB \\', '\n',
' RGLB=lib1 \\', '\n',
' RGPL=ILLUMINA \\', '\n',
' RGPU=NA \\', '\n',
' RGSM=accepted_hits.bam" >> commands.txt', '\n',
' sbatch --time=12:00:00 --wrap "java -jar ${EBROOTPICARD}/picard.jar AddOrReplaceReadGroups \\', '\n',
' I=${alignDir}/${sample}.${samplesuffix} \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' RGID=$sample \\', '\n',
' RGCN=TheFrancisCrickInstitute \\', '\n',
' RGLB=lib1 \\', '\n',
' RGPL=ILLUMINA \\', '\n',
' RGPU=NA \\', '\n',
' RGSM=accepted_hits.bam" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.addorreplacereadgroups.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n', '\n',
'#################################################################################', '\n',
'# SortSam #', '\n',
'#################################################################################', '\n', '\n',
'projectID="SortSam"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'echo "SortSam co-ordinate sort" >> commands.txt', '\n',
'echo "SortSam output"', '\n',
'#projectID="coordinate_sort"', '\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar SortSam \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
' SO=coordinate \\', '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt ', '\n',
' sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar SortSam \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
' SO=coordinate \\', '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.sortsam.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'for sample in $samples', '\n',
'do', '\n',
'echo "rm ${alignDir}/${sample}.accepted_hits.readgroups.bam" >> commands.txt ', '\n',
'rm ${alignDir}/${sample}.accepted_hits.readgroups.bam ', '\n',
'done', '\n',
'\n',
'#################################################################################', '\n',
'# Reorder SAM #', '\n',
'#################################################################################', '\n', '\n',
'projectID="ReorderSam"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'echo "reorder reads to match contigs in the reference" >> commands.txt', '\n',
'echo "Reorder reads to match contigs in the reference"', '\n',
'for sample in $samples', '\n',
'do ', '\n',
'echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar ReorderSam \\', '\n',
'I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
'O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
'REFERENCE=${genome_fa} \\', '\n',
'TMP_DIR=\'pwd\'/tmp" >> commands.txt ', '\n',
'\n',
'sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar ReorderSam \\', '\n',
'I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
'O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
'REFERENCE=${genome_fa} \\', '\n',
'TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.reordersam.slurm >> commands.txt ', '\n',
'\n',
'done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam" >> commands.txt', '\n',
' rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam', '\n',
' done', '\n',
'\n',
'#################################################################################', '\n',
'# MarkDuplicates #', '\n',
'#################################################################################', '\n', '\n',
'projectID="MarkDuplicates"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="markdups"', '\n',
'echo "mark duplicates" >> commands.txt', '\n',
'echo "mark duplicates"', '\n',
' for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar MarkDuplicates \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
' O=${alignDir}/${sample}.d.bam \\', '\n',
' METRICS_FILE=${alignDir}/${sample}/dup_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt', '\n',
'\n',
' sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar MarkDuplicates \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
' O=${alignDir}/${sample}.d.bam \\', '\n',
' METRICS_FILE=${alignDir}/${sample}/dup_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 2 --mem-per-cpu=7000 -o logs/$sample.markduplicates.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'##############################', '\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam" >> commands.txt', '\n',
' rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam', '\n',
' done', '\n',
'#################################################################################', '\n',
'# Samtool indexing #', '\n',
'#################################################################################', '\n','\n',
'projectID="Samtool indexing"', '\n',
'module load SAMtools/1.3.1-foss-2016b', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'echo "module load SAMtools/1.3.1-foss-2016b" >> commands.txt', '\n',
'echo "Samtool Indexing "', '\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "samtools index ${alignDir}/${sample}.d.bam" >> commands.txt', '\n',
' sbatch --time=12:00:00 --wrap "samtools index ${alignDir}/${sample}.d.bam" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.samtools.index.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n'
)
###########################################################################
## Estimate library complexity ##
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run Estimate Library Complexity #', '\n',
'#################################################################################', '\n', '\n',
'projectID="EstimateLibraryComplexity"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="EstimateLibraryComplexity"', '\n',
'echo "EstimateLibraryComplexity" >> commands.txt', '\n',
'echo "EstimateLibraryComplexity"', '\n',
' for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar EstimateLibraryComplexity \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.est_lib_complex_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt', '\n',
'\n',
'\n',
'sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar EstimateLibraryComplexity \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.est_lib_complex_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 2 --mem-per-cpu=7000 -o logs/$sample.estimatelibrarycomplexity.slurm >> commands.txt ', '\n',
'\n',
'\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'\n'
)
## Done estimating library complexity ##
###########################################################################
###########################################################################
## Add rnaseq metrics ##
## Add on - do existence check ##
refFlatFile <- obj@parameterList$refFlatFile
if (length(refFlatFile) == 0){
refFlatFile <- ""
}
ribosomalIntervalList <- obj@parameterList$ribosomalIntervalList
if (length(ribosomalIntervalList) == 0){
ribosomalIntervalList <- ""
}
if (refFlatFile != "" & ribosomalIntervalList != ""){
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run CollectRnaSeqMetrics #', '\n',
'#################################################################################', '\n', '\n',
'projectID="CollectRnaSeqMetrics"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="CollectRnaSeqMetrics"', '\n',
'echo "CollectRnaSeqMetrics" >> commands.txt', '\n',
'echo "CollectRnaSeqMetrics"', '\n',
' for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar CollectRnaSeqMetrics \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.output.RNA_Metrics \\', '\n',
paste0(' REF_FLAT=',refFlatFile,' \\'), '\n',
paste0(' STRAND=',strandedness,' \\'), '\n',
paste0(' RIBOSOMAL_INTERVALS=',ribosomalIntervalList,' \\'), '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt', '\n',
'\n',
'\n',
'sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar CollectRnaSeqMetrics \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.output.RNA_Metrics \\', '\n',
paste0(' REF_FLAT=',refFlatFile,' \\'), '\n',
paste0(' STRAND=',strandedness,' \\'), '\n',
paste0(' RIBOSOMAL_INTERVALS=',ribosomalIntervalList,' \\'), '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 2 --mem-per-cpu=7000 -o logs/$sample.rnaseqmetrics.slurm >> commands.txt ', '\n',
'\n',
'\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'\n'
)
}
## End rnaseq metrics ##
###########################################################################
###########################################################################
## Add infer experiment ##
bedFile <- obj@parameterList$bedFile
if (length(bedFile) == 0){
bedFile <- ""
}
if (bedFile != ""){
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run RNASeQC Infer_experiment #', '\n',
'#################################################################################', '\n', '\n',
'projectID="Infer_experiment"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="Infer_experiment"', '\n',
'echo "Infer_experiment" >> commands.txt', '\n',
'echo "Infer_experiment"', '\n',
'module purge;','\n',
'module load RSeQC/2.6.4-foss-2016b-Python-2.7.12-R-3.3.1;', '\n',
' for sample in $samples', '\n',
' do', '\n',
paste0(
'echo "infer_experiment.py -r ',
bedFile,
' -i ${alignDir}/${sample}.d.bam > ${alignDir}/${sample}.infer_experiment.txt" >> commands.txt \\'
),
'\n',
'\n',
paste0('sbatch --time=12:00:00 --wrap "infer_experiment.py -r ',
bedFile,
' -i ${alignDir}/${sample}.d.bam > ${alignDir}/${sample}.infer_experiment.txt" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.infer_experiment.slurm >> commands.txt '
),
'\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'\n'
)
}
## End rnaseq metrics ##
###########################################################################
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run RNASeqC #', '\n',
'#################################################################################', '\n','\n',
'projectID="RNAseQC"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'module load RNA-SeQC/1.1.8-Java-1.7.0_80', '\n', '\n',
'echo "module load RNA-SeQC/1.1.8-Java-1.7.0_80" >> commands.txt', '\n',
'echo "make RNASeqC sample list"', '\n',
'cd ${alignDir}', '\n',
'\n',
'if [ ! -d ${projectID} ]; then', '\n',
' mkdir ${projectID}', '\n',
'fi', '\n',
'\n',
'\n',
'echo "sample list" > ${alignDir}/${projectID}/sample.list', '\n',
'for sample in $samples', '\n',
'do', '\n',
'echo -e "$sample\t${alignDir}/${sample}.d.bam\tNA" >>${alignDir}/${projectID}/sample.list', '\n',
'done', '\n',
'#wait', '\n',
'\n',
'echo "run RNA-seqQC"', '\n',
'\n',
'#RNASeqQC requires Java version 1.7 and does not run on the most recent version. ', '\n',
'\n',
'echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTRNAMINSEQC}/RNA-SeQC_v1.1.8.jar \\', '\n'
)
if (!obj@parameterList$paired.end){
tempShellScriptVector <- c(
tempShellScriptVector,
'-singleEnd \\', '\n'
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
'-o ${alignDir}/${projectID}/ \\', '\n',
'-r ${genome_fa} \\', '\n',
'-s ${alignDir}/${projectID}/sample.list \\', '\n',
'-t $GTFfile \\', '\n',
'-gatkFlags \'-S SILENT -U ALLOW_SEQ_DICT_INCOMPATIBILITY\' \\', '\n',
'-rRNA $rRNAfile " >> commands.txt', '\n',
'\n',
'sbatch --time=48:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTRNAMINSEQC}/RNA-SeQC_v1.1.8.jar \\', '\n'
)
if (!obj@parameterList$paired.end){
tempShellScriptVector <- c(
tempShellScriptVector,
'-singleEnd \\', '\n'
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
'-o ${alignDir}/${projectID}/ \\', '\n',
'-r ${genome_fa} \\', '\n',
'-s ${alignDir}/${projectID}/sample.list \\', '\n',
'-t $GTFfile \\', '\n',
'-gatkFlags \'-S SILENT -U ALLOW_SEQ_DICT_INCOMPATIBILITY\' \\', '\n',
'-rRNA $rRNAfile " --job-name=$project -c 1 --mem-per-cpu=7000 -o ${alignDir}/rnaseqc.slurm >> ${alignDir}/commands.txt ', '\n',
'\n',
'\n',
'wait_on_lsf',
'\n',
'module purge; module use /camp/stp/babs/working/software/modules/all; module load multiqc/1.3-2016b-Python-2.7.12',
'\n',
paste0("multiqc ", obj@parameterList$workdir),
'\n',
'#end of file', '\n'
)
obj <- add2vec(
obj = obj,
slot_name = scriptVecSlot,
value = tempShellScriptVector
)
return(obj)
}
)
###############################################################################
## Create dds base object ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @import DESeq2
#' @export
#'
#'
setGeneric(
name="createDdsObject",
def=function(obj) {
library(DESeq2)
if (length(obj@parameterList$batchMode) == 0){
obj@parameterList$batchMode <- FALSE
}
if (obj@parameterList$batchMode){
colData = unique(obj@dfDesign[, c("sample.id", "sample.group","replicate")])
rownames(colData) = as.vector(colData$sample.id)
colData$sample.id <- NULL
colnames(colData)[1] = "condition"
colData$condition <- as.factor(colData$condition)
colData$replicate <- as.factor(colData$replicate)
obj@ObjDds <- DESeqDataSetFromMatrix(
countData = obj@RSEMcountMatrix,
colData = colData,
design = ~ replicate
)
} else {
colData = unique(obj@dfDesign[, c("sample.id", "sample.group")])
rownames(colData) = as.vector(colData$sample.id)
colData$sample.id <- NULL
colnames(colData)[1] = "condition"
colData$condition <- as.factor(colData$condition)
obj@ObjDds <- DESeqDataSetFromMatrix(
countData = obj@RSEMcountMatrix,
colData = colData,
design = ~ condition
)
}
obj <- add2vec(
obj = obj,
slot_name = "documentationVector",
value = 'DESeq2 Paramteters: test="Wald"; betaPrior=FALSE.'
)
obj@ObjDds <- DESeq(
obj@ObjDds,
test = "Wald",
betaPrior = FALSE,
parallel = obj@parameterList$parallelProcessing
)
## Extract norm counts ##
## RSEM-generate-matrix produces a raw-count matrix
if (length(obj@parameterList$DESeq2ToBeNormalized) == 0){
obj@parameterList$DESeq2ToBeNormalized <- TRUE
}
obj@DESeqNormReadCountsTable <- data.frame(
round(
counts(
obj@ObjDds, normalized=obj@parameterList$DESeq2ToBeNormalize
)
)
)
#Remove all rows 0 counts for all samples from df.normCounts
obj@DESeqNormReadCountsTable <- obj@DESeqNormReadCountsTable[rowSums(obj@DESeqNormReadCountsTable)!=0,]
return(obj)
}
)
## Done creating dds base object and norm counts table ##
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="createPCAloadingPlots",
def=function(
obj
) {
library(ggplot2)
contrast_P_lg10p_PCA <- names(obj@dfPCAgenes)[grep("contrast_P_lg10p_PCA",names(obj@dfPCAgenes))]
plotTask <- gsub("contrast_P_lg10p_", "", contrast_P_lg10p_PCA )
dfAnno <- unique(obj@dfGeneAnnotation[,c(obj@parameterList$primaryAlignmentGeneID, obj@parameterList$geneIDcolumn)])
dfAnno <- dfAnno[dfAnno[,obj@parameterList$primaryAlignmentGeneID] %in% obj@dfPCAgenes[,obj@parameterList$primaryAlignmentGeneID],]
dfPlot <- merge(
obj@dfPCAgenes,
dfAnno,
by.x = obj@parameterList$primaryAlignmentGeneID,
by.y = obj@parameterList$primaryAlignmentGeneID,
all = TRUE
)
dfPlot[is.na(dfPlot)] <- ""
dfPlot[dfPlot[,obj@parameterList$geneIDcolumn] == "", obj@parameterList$geneIDcolumn] <- dfPlot[dfPlot[,obj@parameterList$geneIDcolumn] == "", obj@parameterList$primaryAlignmentGeneID]
for (i in 1:length(plotTask)){
plotName <- paste0(plotTask[i], "_PCA_fitting")
cols <- c(obj@parameterList$geneIDcolumn, names(dfPlot)[grep(paste0(plotTask[i],"$"), names(dfPlot))])
dfPlotSel <- unique(dfPlot[,cols])
names(dfPlotSel) <- gsub(plotTask[i], "PCA", names(dfPlotSel))
dfPlotSel <- dfPlotSel[order(dfPlotSel$r2_PCA, decreasing = TRUE),]
tryCatch({
obj@plotCollection[[plotName]] <- ggplot(
data=dfPlotSel,
aes(
x=contrast_P_PCA_estimatePCA,
y=contrast_P_lg10p_PCA,
size = r2_PCA,
alpha = I(r2_PCA)
)) + geom_point(
) + labs(title = plotTask[i] ,x = "Estimate", y = "-log10(padjust)"
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
)
},
error = function(c) "Plot error",
warning = function(c) "warning",
message = function(c) "message"
)
}
return(obj)
}
)
###############################################################################
## Method Determine row variability ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="addCoVar",
def=function(
obj,
avgCountCutOffperSample = 0,
selectionColName = "aboveCutOff",
dfBaseData = "Obio@DESeqNormReadCountsTable",
rowNameID = "obj@parameterList$primaryAlignmentGeneID"
#options: "DEseq2RV" or "CoVar"
) {
###########################################################################
## Calculate average row value per sample ##
avgCountsPerGenePerSample <- round(
rowSums(dfBaseData)/ncol(dfBaseData),3
)
dfCountCutOff <- data.frame(avgCountsPerGenePerSample, names(avgCountsPerGenePerSample))
names(dfCountCutOff) <- c("avgCountsPerGenePerSample", rowNameID)
#########################################################################
## Calculate coefficient of variation ##
dfCoVar <- dfBaseData
## Remove all zero rows ##
dfCoVar["CoVar"]<- apply(
dfCoVar,
1,
function(x) sd(x)/mean(x)
)
dfCoVar[dfCoVar$CoVar == Inf, "CoVar"] <- max(dfCoVar[dfCoVar$CoVar < Inf ,"CoVar"])
dfCoVar[[rowNameID]] <- row.names(dfCoVar)
dfCoVar <- dfCoVar[,c(rowNameID, "CoVar")]
dfRv <- merge(
dfCountCutOff,
dfCoVar,
by.x = rowNameID,
by.y = rowNameID
)
dfRv[is.na(dfRv)] <- 0
dfRv <- dfRv[order(dfRv$CoVar, decreasing = T),]
## Done calculating coefficietn of variation ##
#########################################################################
#########################################################################
## Selection column ##
dfRv[[selectionColName]] <- ""
dfRv[dfRv$avgCountsPerGenePerSample >= avgCountCutOffperSample, selectionColName] <- "+"
## Done adding selection Column ##
#########################################################################
#########################################################################
## Determine DESEQ2 variation estimate ##
if(!is.null(obj@ObjDds)){
library(DESeq2)
if (length(unique(obj@dfDesign$sample.id)) > 42) {
rld <- vst(obj@ObjDds)
} else {
rld <- rlog(obj@ObjDds)
}
#########################################################################
## Create row variance df ##
# Rowvars definition https://www.rdocumentation.org/packages/metaMA/versions/3.1.2/topics/rowVars
DEseq2RV <- rowVars(assay(rld))
assign(rowNameID, names(rld))
dfRvAdd <- data.frame(
DEseq2RV,
get(obj@parameterList$primaryAlignmentGeneID)
)
names(dfRvAdd) <- c("DEseq2RV", rowNameID)
dfRv <- merge(
dfRv,
dfRvAdd,
by.x = rowNameID,
by.y = rowNameID
)
} #End deseq2 object
## Done with DESEQ2 ##
#########################################################################
#########################################################################
## Add results to object ##
obj@dataTableList[["dfRowVar"]] <- dfRv
## Done ##
#########################################################################
return(obj)
})
## Done Method determine row variability ##
#############################################################################
###############################################################################
## PCA Method ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="doPCA",
def=function(
obj,
Ntop4pca = 500,
avgCountCutOffperSample = 0,
pcaSelectionVec = NULL,
pcaDimensionsToInvestigate = c(1:5)
) {
if (!dir.exists(obj@parameterList$DEseq2Dir)){
dir.create(obj@parameterList$DEseq2Dir)
}
setwd(obj@parameterList$DEseq2Dir)
if (obj@parameterList$batchMode){
library(limma)
if (length(unique(obj@dfDesign$sample.id)) > 42) {
mPCA <- removeBatchEffect(assay(vst(obj@ObjDds)), obj@ObjDds$replicate)
} else {
mPCA <- removeBatchEffect(assay(rlog(obj@ObjDds)), obj@ObjDds$replicate)
}
pca <- prcomp(t(mPCA))
#rld <- rlog(ddsPCA, blind=FALSE)
} else {
if (length(unique(obj@dfDesign$sample.id)) > 42) {
rld <- vst(obj@ObjDds)
} else {
rld <- rlog(obj@ObjDds)
}
rv <- rowVars(assay(rld))
## Added variable genes table
if (is.null(pcaSelectionVec)){
select <- order(rv, decreasing = TRUE)[seq_len(Ntop4pca)]
obj@dataTableList[["Ntop4pcaGeneSelection"]] <- row.names(assay(rld)[select, ])
pcaSelectionVec <- row.names(assay(rld)[select, ])
}
pca = prcomp(t(assay(rld)[pcaSelectionVec, ]))
obj@PCApercentVar <- pca$sdev^2/sum(pca$sdev^2)
## Add percent variation plot ##
PercentVariation <- round(100*obj@PCApercentVar,1)
PCdimension <- paste0("PC", 1:length(PercentVariation))
df <- data.frame(
PercentVariation,
PCdimension
)
df <- df[df$PercentVariation > 0,]
library(ggplot2)
obj@plotCollection[["PCAvariationPerDimensionO"]] <- ggplot(df, aes(PCdimension, PercentVariation)) + geom_col() + scale_x_discrete(limits=PCdimension) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
)
#pcaFN <- "pca.table.txt"
#fn = paste("PCA_plot.sample.groups.normalized.counts.png", sep="")
#png(fn, type="cairo")
#dev.off()
## Adding gene annotations ##
dfBase <- assay(rld)[pcaSelectionVec, ]
dfBase <- t(dfBase)
pcaGenes = prcomp(scale(dfBase))
}
df.design.pca <- unique(obj@dfDesign[,c("sample.id", "sample.group")])
df.pca = data.frame(pca$x)
df.pca[["sample.id"]] <- row.names(df.pca)
df.pca <- merge(
df.design.pca,
df.pca,
by.x = "sample.id",
by.y = "sample.id"
)
df.pca <- df.pca[order(df.pca$sample.id),]
names(df.pca) <- gsub("[.]", "_", names(df.pca))
obj@dfPCA <- df.pca
## Add plot ##
tryCatch({
obj@plotCollection[["PCAd1d2plot"]] <- plotPCA(
rld,
ntop = Ntop4pca
)
},
error = function(c) "Plot error",
warning = function(c) "warning",
message = function(c) "message"
)
###########################################################################
## Get loadings ## ##
## Loadings for the first principal component##
dfPcaGenes = data.frame(pcaGenes$x)
dfPcaGenes[[obj@parameterList$primaryAlignmentGeneID]] <- row.names(dfPcaGenes)
dfLoad <- pcaGenes$rotation
dfLoad <- t(dfLoad)
dfBase <- data.frame(dfBase)
#######################################################################
## helper function ##
determinePCAloadings <- function(
lmFitDim = 'lmFitDim',
dfBase = 'dfBase',
primary.alignment.gene.id = 'primary.alignment.gene.id'
){
for (i in 1:ncol(dfBase)){
corVar <- names(dfBase)[i]
regression_formula <- as.formula(paste0("pcaGenes$x[,",lmFitDim,"]~", corVar))
fit <- lm(regression_formula, data=dfBase)
#summary(fit)
p.value <- summary(fit)$coefficients[corVar,"Pr(>|t|)"]
estimate <- summary(fit)$coefficients[corVar,"Estimate"]
intercept <- summary(fit)$coefficients[grep("Intercept", row.names(summary(fit)$coefficients)),"Estimate"]
rsquared <- summary(fit)$r.squared
new.row <- data.frame(p.value, estimate, intercept, rsquared)
row.names(new.row) <- corVar
if (i!=1){
dfRes <- rbind(dfRes, new.row)
} else {
dfRes <- new.row
}
}
dfRes <- dfRes[order(dfRes$p.value, decreasing = FALSE),]
dfRes[["padj"]] <- p.adjust(dfRes$p.value, method = "BH")
names(dfRes) <- paste0(names(dfRes), ".PCA", lmFitDim)
dfRes[[primary.alignment.gene.id]] <- row.names(dfRes)
## Add log10p column ##
padj <- names(dfRes)[grep("padj", names(dfRes))]
lg10p <- gsub("padj", "lg10p", padj)
for (z in 1:length(padj)){
preprocess <- as.numeric(dfRes[,padj[z]])
if (length(grep("padj", padj[i])) > 0){
preprocess <- as.numeric(res[,padj[z]])
minNum <- min(preprocess[preprocess != 0])
preprocess[preprocess == 0] <- minNum
} else {
preprocess <- as.numeric(dfRes[,padj[z]])
}
temp <- -1*log10(preprocess)
#temp[temp >= 50] = 50
dfRes[,lg10p[z]] <- temp
}
## Done adding log10p ##
return(dfRes)
}
## helper function ##
#######################################################################
for (l in 1:length(pcaDimensionsToInvestigate)){
dfTemp <- determinePCAloadings(
lmFitDim = pcaDimensionsToInvestigate[l],
dfBase = dfBase,
primary.alignment.gene.id = obj@parameterList$primaryAlignmentGeneID
)
if (l ==1){
dfAll <- dfTemp
} else {
dfAll <- merge(
dfAll,
dfTemp,
by.x = obj@parameterList$primaryAlignmentGeneID,
by.y = obj@parameterList$primaryAlignmentGeneID
)
}
print(paste0(l, " dimensions out of ", length(pcaDimensionsToInvestigate), " processed."))
}
names(dfAll) <- gsub("intercept.PCA", "intercept_PCA", names(dfAll))
names(dfAll) <- gsub("estimate.PCA", "contrast_P_PCA_estimatePCA", names(dfAll))
names(dfAll) <- gsub("padj.PCA", "contrast_P_padj_PCA", names(dfAll))
names(dfAll) <- gsub("lg10p.PCA", "contrast_P_lg10p_PCA", names(dfAll))
names(dfAll) <- gsub("rsquared.PCA", "r2_PCA", names(dfAll))
obj@dfPCAgenes <- dfAll
## Adding plots ##
obj <- createPCAloadingPlots(obj)
return(obj)
}
)
## Done with pca method ##
###############################################################################
###############################################################################
## PCA Method ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="doLinearFittings",
def=function(
obj,
designColSelector = "",
mode = "independentVariation", ## "independentVariation" or "dependentVariation"
Ntop4pca = 500,
plotname = "plotname"
) {
##
###########################################################################
## Create PCA table for database ##
library(gplots)
library(RColorBrewer)
library(lattice)
library(genefilter)
###########################################################################
## Create variation estimation ##
library(tidyr)
library(ggplot2)
designColSelector = unique(c(designColSelector, "sample.id"))
if (length(unique(obj@dfDesign$sample.id)) > 42) {
rld <- vst(obj@ObjDds)
} else {
rld <- rlog(obj@ObjDds)
}
rv = rowVars(assay(rld))
## Select most variable genes
select = order(rv, decreasing = TRUE)[seq_len(Ntop4pca)]
dfTemp = t(assay(rld)[select, ])
pc <- prcomp(dfTemp, center=TRUE, scale=FALSE)
colDatMin = unique(obj@dfDesign[, designColSelector])
rownames(colDatMin) = as.vector(colDatMin$sample.id)
colDatMin$sample.id <- NULL
#colnames(colData)[1] = "condition"
covar_PC_frame <- rbind(
data.frame(
Component=1:(nrow(pc$x)-1),
spread(
data.frame(
v=names(colDatMin),
val=NA_real_
),
key=v,
value=val
)
)
)
if (mode == "independentVariation"){
covar_PC_frame <- covar_PC_frame[c("Component", names(colDatMin))]
for (i in 1:nrow(covar_PC_frame)) {
## old code from gavin below ##
fit <- lm(pc$x[,i]~., data=colDatMin)
covar_PC_frame[i,-1] <- drop1(fit, test="F")[names(covar_PC_frame)[-1],"Pr(>F)"]
## replaced 25032019 ##
# Fit each variable individually @
}
} else {
mode <- "dependentVariation"
## Do fitting individually ##
## Check that all selVec entries exist
fitVars <- names(covar_PC_frame)
fitVars <- fitVars[fitVars != "Component"]
covar_PC_frame <- covar_PC_frame[c("Component", names(colDatMin))]
for (i in 1:nrow(covar_PC_frame)) {
## old code from gavin below ##
for (j in 1:length(fitVars)){
corVar <- fitVars[j]
if (length(unique(obj@dfDesign[, corVar])) > 1) {
pcDim <- paste0("pc$x[,",i,"]")
regressionFormula <- as.formula(paste(pcDim, corVar, sep="~"))
fit <- lm(regressionFormula, data=colDatMin)
pVal <- as.vector(summary(fit)$coefficients[,4][2])
covar_PC_frame[i, corVar] <- pVal
}
}
}
}
## Create plot ##
plotFrame <- gather(covar_PC_frame, key=Covariate, value=p, -Component)
plotFrame <- plotFrame[order(plotFrame$Component, decreasing = FALSE),]
if (nrow(plotFrame) > 20) {
plotFrame <- plotFrame[1:(length(names(colDatMin)) * 20), ]
}
## Cut to 10 dimensions ##
obj@plotCollection[[plotname]] <- ggplot(
plotFrame,
aes(x=Component, y=Covariate, fill=-log10(p))) +
geom_raster() +
scale_fill_gradient(low="grey90", high="red") +
theme_classic() +
coord_fixed() +
scale_x_continuous( labels = unique(plotFrame$Component), breaks = unique(plotFrame$Component)
)
return(obj)
}
)
## Done creating estimation variation ##
###########################################################################
###########################################################################
## Create sample dendrogram ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="createSampleDendrogram",
def=function(
obj,
Ntop4pca = 500,
plotname = "dendrogram"
) {
library(ggdendro)
###########################################################################
## Create dendrogram ##
if (length(unique(obj@dfDesign$sample.id)) > 42) {
rld <- vst(obj@ObjDds)
} else {
rld <- rlog(obj@ObjDds)
}
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(Ntop4pca)]
normCounts <- (assay(rld)[select, ])
c <- cor(as.matrix(normCounts), method="pearson")
d <- as.dist(1-c)
hr <- hclust(d, method = "ward.D", members=NULL)
tryCatch({
obj@plotCollection[[plotname]] <- ggdendrogram(
hr,
rotate = TRUE,
size = 4,
theme_dendro = FALSE,
color = "tomato"
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
)
},
error = function(c) "Plot error",
warning = function(c) "warning",
message = function(c) "message"
)
return(obj)
}
)
## Done with dendrogram functionality ##
###############################################################################
###############################################################################
## Do differential gene expression ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="doDGEanalysis",
def=function(
obj,
DGEdesignCols = 'names(Obio@dfDesign)[grep("comp_", names(Obio@dfDesign)) ]',
createNewResultTable = TRUE,
normaliseAllSamplesTogether = FALSE
) {
#######################################################################
## Ensure result table slot is reset ##
if (createNewResultTable){
obj@DEseq2contrastTable <- data.frame(NULL)
}
## Done emptying past results ##
#######################################################################
#######################################################################
## DGE Analysis ##
if (length(DGEdesignCols) > 0){
for (i in 1:length(DGEdesignCols)){
if (obj@parameterList$batchMode){
selCols <- c("sample.id", "sample.group","replicate", DGEdesignCols[i])
designFormula <- as.formula("~ replicate + condition")
} else {
selCols <- c("sample.id", "sample.group", DGEdesignCols[i])
designFormula <- as.formula("~ condition")
}
colData = unique(obj@dfDesign[, selCols])
rownames(colData) = as.vector(colData$sample.id)
colnames(colData)[1] = "condition"
colData[,1] <- colData[,DGEdesignCols[i]]
if (obj@parameterList$batchMode){
colData$replicate <- as.factor(colData$replicate)
}
if (!normaliseAllSamplesTogether) {
colData = droplevels(data.frame(colData[colData$condition != "",]))
colData <- colData[order(colData$condition),]
} else {
colData[colData$condition == "", "condition"] <- "rest"
}
colData[,"condition"] = as.factor(colData[,"condition"])
colData$sample.group <- as.factor(colData$sample.group)
#colData$sample.group <- as.factor(colData$sample.group)
#Remove superflous rows from colData
## Extract order for col names ##
contrasts = sort(unique(obj@dfDesign[,DGEdesignCols[i]]), decreasing = FALSE)
contrasts = contrasts[contrasts != ""]
## Remove order suffix
colData$condition <- gsub("^1_", "", colData$condition)
colData$condition <- gsub("^2_", "", colData$condition)
contrasts <- gsub("^1_", "", contrasts)
contrasts <- gsub("^2_", "", contrasts)
#Create contrast vector
#contrast.vector = c([condition],[1_diff.gene set, e.g. mt],[2_baseline, e.g. wt])
#if (contrasts[2] != "scr"){
# contrasts = rev(contrasts)
#}
sel.col = contrasts
contrast.vector = append("condition", contrasts)
colName = paste(contrasts, collapse = "_vs_")
if (normaliseAllSamplesTogether) {
raw.counts.temp = obj@RSEMcountMatrix
} else {
raw.counts.temp = obj@RSEMcountMatrix[,rownames(colData)]
}
## Make factor ##
colData$condition <- as.factor(colData$condition)
#colData$sample.group <- as.factor(colData$sample.group)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.temp,
colData = colData,
design = designFormula
)
#dds$condition <- factor(dds$condition, levels=contrasts)
dds <- DESeq(
dds,
test = "Wald",
parallel = obj@parameterList$parallelProcessing,
betaPrior = obj@parameterList$DEseq2betaPrior
)
res <- results(dds, contrast = contrast.vector)
#https://support.bioconductor.org/p/83773/
#res <- results(dds, contrast=list("conditioncell_type_A","conditioncell_type_B"))
#Create MA plot
library(ggpubr)
library(ggplot2)
plotname <- paste0("MAplot_", colName)
tryCatch({
obj@plotCollection[[plotname]] <- ggmaplot(
res, main = plotname,
fdr = 0.05, fc = 4, size = 1,
palette = c("#B31B21", "#1465AC", "darkgray"),
genenames = as.vector(row.names(res)),
legend = "top", top = 5,
font.label = c("bold", 5),
font.legend = "bold",
font.main = "bold",
ggtheme = ggplot2::theme_minimal())+
theme(plot.title = element_text(hjust = 0.5),
panel.border = element_rect(colour = "black", fill=NA, size=1)
) + ylim(-10, 10)
# obj@plotCollection[[plotname]] = print(plotMA(res, main=colName))
},
error = function(c) "MA plot not produced due to X11 error",
warning = function(c) "warning",
message = function(c) "message"
)
#Identify most variable genes in the dataset
#Use sd(row)/mean(row)
#Create PCA plot based on the most variable genes in the dataset
#######################################################################
#Continue with the differential gene expression analysis
## reference https://support.bioconductor.org/p/95695/
if (obj@parameterList$DEseq2betaPrior == FALSE) {
res <- lfcShrink(dds, coef="log2FoldChange", type="apeglm")
}
res = data.frame(res)
names(res) = paste(names(res), colName, sep="_")
res[[obj@parameterList$primaryAlignmentGeneID]] = rownames(res)
names(res) = gsub("log2FoldChange", "logFC", names(res))
names(res) = gsub(
"logFC",
paste("contrast_", i, "_logFC", sep=""),
names(res)
)
names(res) = gsub(
"padj",
paste("contrast_", i, "_padj", sep=""),
names(res)
)
names(res) = gsub(
"stat",
paste("contrast_", i, "_stat", sep=""),
names(res)
)
res$baseMean <- log2(res$baseMean)
names(res) = gsub(
"baseMean",
paste("contrast_", i, "_lg2BaseMean", sep=""),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
res[,padj.col] <- as.numeric(res[,padj.col])
## Add log10p column ##
padj <- names(res)[grep("_padj_", names(res))]
lg10p <- gsub("padj", "lg10p", padj)
for (z in 1:length(padj)){
preprocess <- as.numeric(res[,padj[z]])
minNum <- min(preprocess[preprocess != 0])
preprocess[preprocess == 0] <- minNum
# if (length(grep("padj_LRT", padj[i])) > 0){
# preprocess <- as.numeric(res[,padj[z]])
# minNum <- min(preprocess[preprocess != 0])
# preprocess[preprocess == 0] <- minNum
# } else {
# preprocess <- as.numeric(res[,padj[z]])
# }
temp <- -1*log10(preprocess)
#temp[temp >= 50] = 50
res[,lg10p[z]] <- temp
}
col.vector = c(
obj@parameterList$primaryAlignmentGeneID,
names(res)[grep("contrast", names(res))]
)
res = res[,col.vector]
## Make all numeric columns numeric ##
res[,grep("contrast_", names(res))] <- apply(res[,grep("contrast_", names(res))], 2, as.numeric)
###############################################################
## lg10p 0.00 > 0.001 ##
# lg10pCol <- names(res)[grep("lg10p", names(res))]
# logFCcol <- names(res)[grep("logFC", names(res))]
#
# res[,lg10pCol] <- res[,lg10pCol] + 0.001
# res[res[,logFCcol] == 0, lg10pCol] <- 0
## Done ##
###############################################################
###############################################################
## Make diagnostic Volcano plot ##
dfVplot <- res
dfVplot[["Significance"]] <- "NS"
dfVplot[dfVplot[,grep("padj", names(dfVplot))] < 0.05 & dfVplot[,grep("logFC", names(dfVplot))] > 2, "Significance"] <- "Up"
dfVplot[dfVplot[,grep("padj", names(dfVplot))] < 0.05 & dfVplot[,grep("logFC", names(dfVplot))] < -2, "Significance"] <- "Down"
nrow(dfVplot[dfVplot$Significance == "Up",])
nrow(dfVplot[dfVplot$Significance == "Down",])
dsize <- 1
alpha <- I(0.5)
shape <- 21
tryCatch({
library(ggplot2)
plotname <- paste0("Volcano_Plot_", colName)
obj@plotCollection[[plotname]] <- ggplot(
data=dfVplot,
aes_string(
x=names(res)[grep("logFC", names(res))],
y=names(res)[grep("lg10p", names(res))],
fill = "Significance", alpha = alpha
)
) + geom_vline(xintercept = 0, color = "black", size=0.5
) + geom_hline(yintercept = 0, color = "black", size=0.5
) + geom_vline(xintercept = c(-2,2), color = "red", size=0.5,linetype = 2
) + geom_hline(yintercept = c(1.3), color = "red", size=0.5,linetype = 2
) + geom_point(shape=shape
) + labs(title = plotname, y = "-log10(padjust)"
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12),
panel.grid.minor = element_blank()
) + scale_x_continuous(breaks=c(-2,2,seq(-30,30,5))
) + scale_y_continuous(breaks=c(seq(0,400,5))
) + scale_color_manual(values=c("black", "black", "black")
) + scale_fill_manual(values=c("blue", "grey", "red")
) + guides(color = FALSE
)
#obj@plotCollection[[plotname]] = print(plotMA(res, main=colName))
},
error = function(c) "MA plot not produced due to X11 error",
warning = function(c) "warning",
message = function(c) "message"
)
## Done diagnostic Volcano plot ##
###############################################################
## Add to result array ##
if (nrow(obj@DEseq2contrastTable) == 0){
obj@DEseq2contrastTable <- res
} else {
obj@DEseq2contrastTable <- merge(
obj@DEseq2contrastTable,
res,
by.x = obj@parameterList$primaryAlignmentGeneID,
by.y = obj@parameterList$primaryAlignmentGeneID,
all = TRUE
)
obj@DEseq2contrastTable[is.na(obj@DEseq2contrastTable)] <- 0
}
}
####################################
## End for loop for DGE ####
####################################
} ## Ed DGE
return(obj)
}
)
## Done LRT/DGE method ##
###############################################################################
###############################################################################
## Do LRT analysis ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="doLRTanalysis",
def=function(
obj,
LRTdesignCols = 'names(Obio@dfDesign)[grep("LRT_", names(Obio@dfDesign)) ]',
createNewResultTable = TRUE
) {
#######################################################################
## Ensure result table slot is reset ##
if (createNewResultTable){
obj@DEseq2LRTtable <- data.frame(NULL)
}
## Done emptying past results ##
#######################################################################
if (length(LRTdesignCols) > 0){
for (i in 1:length(LRTdesignCols)){
if (obj@parameterList$batchMode){
selCols <- c("sample.id", "sample.group","replicate", LRTdesignCols[i])
designFormula <- as.formula("~ replicate + condition")
} else {
selCols <- c("sample.id", "sample.group", LRTdesignCols[i])
designFormula <- as.formula("~ condition")
}
colData = unique(obj@dfDesign[, selCols])
rownames(colData) = as.vector(colData$sample.id)
colnames(colData)[1] = "condition"
colData[,1] = as.factor(colData[,LRTdesignCols[i]])
colData$sample.group <- as.factor(colData$sample.group)
#Remove superflous rows from colData
colData = droplevels(data.frame(colData[colData$condition != "",]))
colData <- colData[order(colData$condition),]
## Extract order for col names ##
contrasts = sort(unique(obj@dfDesign[,LRTdesignCols[i]]), decreasing = FALSE)
contrasts = contrasts[contrasts != ""]
## Remove order suffix
colData$condition <- gsub("^1_", "", colData$condition)
colData$condition <- gsub("^2_", "", colData$condition)
contrasts <- gsub("^1_", "", contrasts)
contrasts <- gsub("^2_", "", contrasts)
#Create contrast vector
#contrast.vector = c([condition],[1_diff.gene set, e.g. mt],[2_baseline, e.g. wt])
#if (contrasts[2] != "scr"){
# contrasts = rev(contrasts)
#}
sel.col = contrasts
contrast.vector = append("condition", contrasts)
colName = paste(contrasts, collapse = "_vs_")
raw.counts.temp = obj@RSEMcountMatrix[,rownames(colData)]
## Make factor ##
colData$condition <- as.factor(colData$condition)
colData$sample.group <- as.factor(colData$sample.group)
if (obj@parameterList$batchMode){
colData$replicate <- as.factor(colData$replicate)
}
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.temp,
colData = colData,
design = designFormula
)
## Perform LRT ##
if (obj@parameterList$batchMode){
reducedFormula <- as.formula("~ replicate")
} else {
reducedFormula <- as.formula("~ 1")
}
dds <- DESeq(
dds,
test = "LRT",
parallel = obj@parameterList$parallelProcessing,
reduced = reducedFormula
)
res <- results(dds)
###############################################################
## Add result to result collection ##
#Continue with the differential gene expression analysis
res = data.frame(res)
res[[obj@parameterList$primaryAlignmentGeneID]] = rownames(res)
res$stat <- NULL
res$baseMean <- log2(res$baseMean)
names(res) = gsub(
"baseMean",
paste0("contrast_L_lg2BaseMean_", LRTdesignCols[i]),
names(res)
)
names(res) = gsub(
"padj",
paste0("contrast_L_padj_", LRTdesignCols[i]),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
res[,padj.col] <- as.numeric(res[,padj.col])
## Add log10p column ##
padj <- names(res)[grep("_padj_", names(res))]
lg10p <- gsub("padj", "lg10p", padj)
for (z in 1:length(padj)){
preprocess <- as.numeric(res[,padj[z]])
minNum <- min(preprocess[preprocess != 0])
preprocess[preprocess == 0] <- minNum
# if (length(grep("padj_LRT", padj[i])) > 0){
# preprocess <- as.numeric(res[,padj[z]])
# minNum <- min(preprocess[preprocess != 0])
# preprocess[preprocess == 0] <- minNum
# } else {
# preprocess <- as.numeric(res[,padj[z]])
# }
temp <- -1*log10(preprocess)
#temp[temp >= 50] = 50
res[,lg10p[z]] <- temp
}
col.vector = c(
obj@parameterList$primaryAlignmentGeneID,
names(res)[grep("contrast", names(res))]
)
res = res[,col.vector]
## Make all numeric columns numierc ##
## Make all numeric columns numierc ##
res[,grep("contrast_", names(res))] <- apply(res[,grep("contrast_", names(res))], 2, as.numeric)
###############################################################
## lg10p 0.00 > 0.001 ##
# lg10pCol <- names(res)[grep("lg10p", names(res))]
# logFCcol <- names(res)[grep("logFC", names(res))]
#
# res[,lg10pCol] <- res[,lg10pCol] + 0.001
# res[res[,logFCcol] == 0, lg10pCol] <- 0
## Done ##
###############################################################
## Add to result array ##
if (nrow(obj@DEseq2LRTtable) == 0){
obj@DEseq2LRTtable <- res
} else {
obj@DEseq2LRTtable <- merge(
obj@DEseq2LRTtable,
res,
by.x = obj@parameterList$primaryAlignmentGeneID,
by.y = obj@parameterList$primaryAlignmentGeneID,
all = TRUE
)
obj@DEseq2LRTtable[is.na(obj@DEseq2LRTtable)] <- 0
}
## Done adding to result collection ##
###############################################################
}
####################################
## End for loop for LRT ####
####################################
}
## Done with LRT Analysis ##
#######################################################################
return(obj)
}
)
## Done LRT method ##
###############################################################################
###############################################################################
## Do differential gene expression ##
#debug <- as.vector(NULL, mode = "character")
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="DGEanalysis",
def=function(
obj,
createNewResultTable = TRUE
) {
#######################################################################
## Start with DGE analysis ##
#obj@dfModel$comparison <- levels(droplevels(obj@dfModel$comparison))
dfDGE <- obj@dfModel
dfDGE <- dfDGE[dfDGE$test == "Wald",]
#######################################################################
## Ensure result table slot is reset ##
if (createNewResultTable){
pos <- c(
grep("MAplot", names(obj@plotCollection)),
grep("Volcano", names(obj@plotCollection))
)
if (length(pos) > 0){
obj@plotCollection <- obj@plotCollection[-pos]
}
obj@DEseq2contrastTable <- data.frame(NULL)
}
## Done emptying past results ##
#######################################################################
###########################################################################
## Begin DEseq2 block ##
if (nrow(dfDGE) > 0){
for (i in 1:nrow(dfDGE)){
designFormula <- as.formula(as.vector(dfDGE[i, "model"]))
addCols <- as.vector(dfDGE[i, "model"])
addCols <- gsub("~", "", addCols)
addCols <- unlist(strsplit(addCols, "[+]"))
addCols <- gsub(" ", "", addCols)
addCols <- addCols[addCols != "condition"]
selCols <- c("sample.id", as.vector(dfDGE[i,"comparisonID"]), addCols,"sample.group")
colData = unique(obj@dfDesign[, selCols])
names(colData) <- gsub(as.vector(paste0("^",dfDGE[i,"comparisonID"], "$")), "condition", names(colData))
rownames(colData) = as.vector(colData$sample.id)
colData$sample.id <- NULL
if (!as.vector(dfDGE[i,"normalizeAllSamplesTogether"])) {
colData = droplevels(data.frame(colData[colData$condition != "",]))
colData <- colData[order(colData$condition),]
} else {
colData[colData$condition == "", "condition"] <- "rest"
}
contrasts = sort(as.vector(unique(colData[,"condition"])), decreasing = FALSE)
contrasts = contrasts[contrasts != "rest"]
contrasts = contrasts[contrasts != ""]
colData$condition <- gsub("^1_", "", colData$condition)
colData$condition <- gsub("^2_", "", colData$condition)
fCols <- c("condition", addCols)
for (j in 1:length(fCols)){
colData[,fCols[j]] <- as.factor(colData[,fCols[j]])
}
colData[,"condition"] = as.factor(colData[,"condition"])
colData$sample.group <- as.factor(colData$sample.group)
contrasts <- gsub("^1_", "", contrasts)
contrasts <- gsub("^2_", "", contrasts)
#Create contrast vector
#contrast.vector = c([condition],[1_diff.gene set, e.g. mt],[2_baseline, e.g. wt])
#if (contrasts[2] != "scr"){
# contrasts = rev(contrasts)
#}
sel.col = contrasts
contrast.vector = append("condition", contrasts)
colName = paste(contrasts, collapse = "_vs_")
colName
if (as.vector(dfDGE[i,"normalizeAllSamplesTogether"])) {
raw.counts.temp = obj@RSEMcountMatrix
} else {
raw.counts.temp = obj@RSEMcountMatrix[,rownames(colData)]
}
colData$condition <- as.factor(colData$condition)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.temp,
colData = colData,
design = designFormula
)
#dds$condition <- factor(dds$condition, levels=contrasts)
if (as.vector(dfDGE[i, "betaPrior"]) == "TRUE"){
betaPrior <- TRUE
} else {
betaPrior <- FALSE
}
dds <- DESeq(
dds,
test = as.vector(dfDGE[i, "test"]),
parallel = obj@parameterList$parallelProcessing,
betaPrior = betaPrior
)
res <- results(dds, contrast = contrast.vector)
#https://support.bioconductor.org/p/83773/
#res <- results(dds, contrast=list("conditioncell_type_A","conditioncell_type_B"))
##################################
##################################
## Inactivate plot routine here ##
## 20210523 ##
##################################
##################################
#Create MA plot
library(ggpubr)
library(ggplot2)
plotname <- paste0("MAplot_", colName)
## Debug
# debug <- c(
# debug,
# plotname
# )
##
tryCatch({
obj@plotCollection[[plotname]] <- ggmaplot(
res, main = plotname,
fdr = 0.05, fc = 4, size = 1,
palette = c("#B31B21", "#1465AC", "darkgray"),
genenames = as.vector(row.names(res)),
legend = "top", top = 5,
font.label = c("bold", 5),
font.legend = "bold",
font.main = "bold",
ggtheme = ggplot2::theme_minimal())+
theme(plot.title = element_text(hjust = 0.5),
panel.border = element_rect(colour = "black", fill=NA, size=1)
) + ylim(-10, 10)
# obj@plotCollection[[plotname]] = print(plotMA(res, main=colName))
},
error = function(c) "MA plot not produced due to X11 error",
warning = function(c) "warning",
message = function(c) "message"
)
######################################
######################################
## Inactivate plot routine here end ##
## 20210523 ##
######################################
######################################
#Identify most variable genes in the dataset
#Use sd(row)/mean(row)
#Create PCA plot based on the most variable genes in the dataset
#######################################################################
#Continue with the differential gene expression analysis
## reference https://support.bioconductor.org/p/95695/
if (obj@parameterList$DEseq2betaPrior == FALSE) {
res <- lfcShrink(dds, coef="log2FoldChange", type="apeglm")
}
res = data.frame(res)
names(res) = paste(names(res), colName, sep="_")
res[[obj@parameterList$primaryAlignmentGeneID]] = rownames(res)
names(res) = gsub("log2FoldChange", "logFC", names(res))
names(res) = gsub(
"logFC",
paste("contrast_", i, "_logFC", sep=""),
names(res)
)
names(res) = gsub(
"padj",
paste("contrast_", i, "_padj", sep=""),
names(res)
)
names(res) = gsub(
"stat",
paste("contrast_", i, "_stat", sep=""),
names(res)
)
res$baseMean <- log2(res$baseMean)
names(res) = gsub(
"baseMean",
paste("contrast_", i, "_lg2BaseMean", sep=""),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
res[,padj.col] <- as.numeric(res[,padj.col])
## Add log10p column ##
padj <- names(res)[grep("_padj_", names(res))]
lg10p <- gsub("padj", "lg10p", padj)
for (z in 1:length(padj)){
preprocess <- as.numeric(res[,padj[z]])
minNum <- min(preprocess[preprocess != 0])
preprocess[preprocess == 0] <- minNum
# if (length(grep("padj_LRT", padj[i])) > 0){
# preprocess <- as.numeric(res[,padj[z]])
# minNum <- min(preprocess[preprocess != 0])
# preprocess[preprocess == 0] <- minNum
# } else {
# preprocess <- as.numeric(res[,padj[z]])
# }
temp <- -1*log10(preprocess)
#temp[temp >= 50] = 50
res[,lg10p[z]] <- temp
}
col.vector = c(
obj@parameterList$primaryAlignmentGeneID,
names(res)[grep("contrast", names(res))]
)
res = res[,col.vector]
## Make all numeric columns numeric ##
res[,grep("contrast_", names(res))] <- apply(res[,grep("contrast_", names(res))], 2, as.numeric)
###############################################################
## lg10p 0.00 > 0.001 ##
# lg10pCol <- names(res)[grep("lg10p", names(res))]
# logFCcol <- names(res)[grep("logFC", names(res))]
#
# res[,lg10pCol] <- res[,lg10pCol] + 0.001
# res[res[,logFCcol] == 0, lg10pCol] <- 0
## Done ##
###############################################################
###############################################################
## Make diagnostic Volcano plot ##
dfVplot <- res
dfVplot[["Significance"]] <- "NS"
dfVplot[dfVplot[,grep("padj", names(dfVplot))] < 0.05 & dfVplot[,grep("logFC", names(dfVplot))] > 2, "Significance"] <- "Up"
dfVplot[dfVplot[,grep("padj", names(dfVplot))] < 0.05 & dfVplot[,grep("logFC", names(dfVplot))] < -2, "Significance"] <- "Down"
nrow(dfVplot[dfVplot$Significance == "Up",])
nrow(dfVplot[dfVplot$Significance == "Down",])
dsize <- 1
alpha <- I(0.5)
shape <- 21
tryCatch({
library(ggplot2)
plotname <- paste0("Volcano_Plot_", colName)
obj@plotCollection[[plotname]] <- ggplot(
data=dfVplot,
aes_string(
x=names(res)[grep("logFC", names(res))],
y=names(res)[grep("lg10p", names(res))],
fill = "Significance", alpha = alpha
)
) + geom_vline(xintercept = 0, color = "black", size=0.5
) + geom_hline(yintercept = 0, color = "black", size=0.5
) + geom_vline(xintercept = c(-2,2), color = "red", size=0.5,linetype = 2
) + geom_hline(yintercept = c(1.3), color = "red", size=0.5,linetype = 2
) + geom_point(shape=shape
) + labs(title = plotname, y = "-log10(padjust)"
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12),
panel.grid.minor = element_blank()
) + scale_x_continuous(breaks=c(-2,2,seq(-30,30,5))
) + scale_y_continuous(breaks=c(seq(0,400,5))
) + scale_color_manual(values=c("black", "black", "black")
) + scale_fill_manual(values=c("blue", "grey", "red")
) + guides(color = FALSE
)
#obj@plotCollection[[plotname]] = print(plotMA(res, main=colName))
},
error = function(c) "MA plot not produced due to X11 error",
warning = function(c) "warning",
message = function(c) "message"
)
## Done diagnostic Volcano plot ##
###############################################################
## Add to result array ##
if (nrow(obj@DEseq2contrastTable) == 0){
obj@DEseq2contrastTable <- res
} else {
obj@DEseq2contrastTable <- merge(
obj@DEseq2contrastTable,
res,
by.x = obj@parameterList$primaryAlignmentGeneID,
by.y = obj@parameterList$primaryAlignmentGeneID,
all = TRUE
)
obj@DEseq2contrastTable[is.na(obj@DEseq2contrastTable)] <- 0
}
}
}
## End DEseq2 analysis ##
#######################################################################
return(obj)
}
)
## Done LRT/DGE method ##
###############################################################################
###############################################################################
## Do LRT analysis ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="LRTanalysis",
def=function(
obj,
createNewResultTable = TRUE
) {
#######################################################################
## Ensure result table slot is reset ##
if (createNewResultTable){
obj@DEseq2LRTtable <- data.frame(NULL)
}
## Done emptying past results ##
#######################################################################
dfDGE <- obj@dfModel
dfDGE <- dfDGE[dfDGE$test == "LRT",]
if (nrow(dfDGE) > 0){
for (i in 1:nrow(dfDGE)){
designFormula <- as.formula(as.vector(dfDGE[i, "model"]))
addCols <- as.vector(dfDGE[i, "model"])
addCols <- gsub("~", "", addCols)
addCols <- unlist(strsplit(addCols, "[+]"))
addCols <- gsub(" ", "", addCols)
addCols <- addCols[addCols != "condition"]
selCols <- c("sample.id", as.vector(dfDGE[i,"comparisonID"]), addCols,"sample.group")
colData = unique(obj@dfDesign[, selCols])
names(colData) <- gsub(as.vector(paste0("^",dfDGE[i,"comparisonID"], "$")), "condition", names(colData))
rownames(colData) = as.vector(colData$sample.id)
colData$sample.id <- NULL
if (!as.vector(dfDGE[i,"normalizeAllSamplesTogether"])) {
colData = droplevels(data.frame(colData[colData$condition != "",]))
colData <- colData[order(colData$condition),]
} else {
colData[colData$condition == "", "condition"] <- "rest"
}
colData$condition <- gsub("^1_", "", colData$condition)
colData$condition <- gsub("^2_", "", colData$condition)
fCols <- c("condition", addCols)
for (j in 1:length(fCols)){
colData[,fCols[j]] <- as.factor(colData[,fCols[j]])
}
colData[,"condition"] = as.factor(colData[,"condition"])
colData$sample.group <- as.factor(colData$sample.group)
contrasts = sort(as.vector(unique(colData[,"condition"])), decreasing = FALSE)
contrasts = contrasts[contrasts != "rest"]
contrasts = contrasts[contrasts != ""]
contrasts <- gsub("^1_", "", contrasts)
contrasts <- gsub("^2_", "", contrasts)
#Create contrast vector
#contrast.vector = c([condition],[1_diff.gene set, e.g. mt],[2_baseline, e.g. wt])
#if (contrasts[2] != "scr"){
# contrasts = rev(contrasts)
#}
sel.col = contrasts
contrast.vector = append("condition", contrasts)
colName = paste(contrasts, collapse = "_vs_")
if (as.vector(dfDGE[i,"normalizeAllSamplesTogether"])) {
raw.counts.temp = obj@RSEMcountMatrix
} else {
raw.counts.temp = obj@RSEMcountMatrix[,rownames(colData)]
}
colData$condition <- as.factor(colData$condition)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.temp,
colData = colData,
design = designFormula
)
dds <- DESeq(
dds,
test = as.vector(dfDGE[i,"test"]),
parallel = obj@parameterList$parallelProcessing,
reduced = as.formula(as.vector(dfDGE[i, "reducedModel"]))
)
res <- results(dds)
###############################################################
## Add result to result collection ##
#Continue with the differential gene expression analysis
res = data.frame(res)
res[[obj@parameterList$primaryAlignmentGeneID]] = rownames(res)
res$stat <- NULL
res$baseMean <- log2(res$baseMean)
names(res) = gsub(
"baseMean",
paste0("contrast_L_lg2BaseMean_", as.vector(dfDGE[i, "comparison"])),
names(res)
)
names(res) = gsub(
"padj",
paste0("contrast_L_padj_", as.vector(dfDGE[i, "comparison"])),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
res[,padj.col] <- as.numeric(res[,padj.col])
## Add log10p column ##
padj <- names(res)[grep("_padj_", names(res))]
lg10p <- gsub("padj", "lg10p", padj)
for (z in 1:length(padj)){
preprocess <- as.numeric(res[,padj[z]])
minNum <- min(preprocess[preprocess != 0])
preprocess[preprocess == 0] <- minNum
# if (length(grep("padj_LRT", padj[i])) > 0){
# preprocess <- as.numeric(res[,padj[z]])
# minNum <- min(preprocess[preprocess != 0])
# preprocess[preprocess == 0] <- minNum
# } else {
# preprocess <- as.numeric(res[,padj[z]])
# }
temp <- -1*log10(preprocess)
#temp[temp >= 50] = 50
res[,lg10p[z]] <- temp
}
col.vector = c(
obj@parameterList$primaryAlignmentGeneID,
names(res)[grep("contrast", names(res))]
)
res = res[,col.vector]
## Make all numeric columns numierc ##
## Make all numeric columns numierc ##
res[,grep("contrast_", names(res))] <- apply(res[,grep("contrast_", names(res))], 2, as.numeric)
###############################################################
## lg10p 0.00 > 0.001 ##
# lg10pCol <- names(res)[grep("lg10p", names(res))]
# logFCcol <- names(res)[grep("logFC", names(res))]
#
# res[,lg10pCol] <- res[,lg10pCol] + 0.001
# res[res[,logFCcol] == 0, lg10pCol] <- 0
## Done ##
###############################################################
## Add to result array ##
if (nrow(obj@DEseq2LRTtable) == 0){
obj@DEseq2LRTtable <- res
} else {
obj@DEseq2LRTtable <- merge(
obj@DEseq2LRTtable,
res,
by.x = obj@parameterList$primaryAlignmentGeneID,
by.y = obj@parameterList$primaryAlignmentGeneID,
all = TRUE
)
obj@DEseq2LRTtable[is.na(obj@DEseq2LRTtable)] <- 0
}
## Done adding to result collection ##
###############################################################
}
####################################
## End for loop for LRT ####
####################################
}
## Done with LRT Analysis ##
#######################################################################
return(obj)
}
)
## Done LRT method ##
###############################################################################
###############################################################################
# (8c) do.differential.expresion.analysis #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
do.differential.expression.analyis <- function(
raw.counts.filt = raw.counts.filt, #count data filename
DEseq2Dir = paste0(localWorkDir, "DESeq2"), #directory for results
df.design = df.design, #df.design
gene.id = "ENSMUSG", #primary gene id after alignment
batch.mode = FALSE,
#if true, df.design needs to contain a 'replicate' column
parallel = FALSE,
toBeNormalized = TRUE, # TRUE if the dataset is to be normalized # False if a normalized matrix is provided
plotOutputDir = "",
doPCA = TRUE,
writePlotsToFile = TRUE,
timeseries = FALSE,
tempShellScriptVector = as.vector(NULL, mode = "character")
) {
#######################################
#Differential gene expresion in DESEQ2#
#######################################
library(DESeq2)
tempShellScriptVector <- c(
tempShellScriptVector,
paste0(capture.output(sessionInfo()), " \n")
)
#Reduce df.design to one row per sample; getting rid of separate R1/R2 rows
sel.vec <- names(df.design)[grep("comp_", names(df.design))]
if (length(grep("^FASTQ$", names(df.design))) == 0){
df.design[["FASTQ"]] <- df.design$sample.id
}
sel.vec <- c(
sel.vec,
"FASTQ",
"sample.id",
"sample.group",
"dataseries",
"replicate",
names(df.design)[grep("^LRT_", names(df.design))]
)
# if (batch.mode){
# sel.vec <- c(
# sel.vec,
# "replicate"
# )
#
# }
## Add timeseries parameter ##
if (timeseries){
sel.vec <- c(
sel.vec,
"timepoint"
)
}
rmVec <- grep("LRT_", sel.vec)
if (length(rmVec) > 0){
sel.vec <- sel.vec[-rmVec]
}
df.design <- unique(
df.design[,sel.vec]
)
comparisons = names(df.design)[grep("comp_", names(df.design))]
if (length(grep("DESeq2", list.dirs())) == 0){
dir.create(DEseq2Dir)
}
setwd(DEseq2Dir)
## Ensure that replicate is set ##
if (!batch.mode) {
df.design[["replicate"]] = paste0("B", df.design$sample.id)
}
###########################################################################
## Perform sample group LRT ##
if (length())
#Evaluate results
res <- results(dds)
#Continue with the differential gene expression analysis
res = data.frame(res)
res[[gene.id]] = rownames(res)
names(res) = gsub("log2FoldChange", "LRT_logFC", names(res))
res$LRT_logFC <- NULL
res$stat <- NULL
names(res) = gsub(
"padj",
"contrast_G_padj_LRTsampleGroup",
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
fn = paste("DESeq2.results.LRTsampleGroup.txt", sep="")
#Select columns to carry forward
col.vector = gene.id
col.vector = append(
col.vector,
names(res)[grep("LRT", names(res))]
)
res = res[,col.vector]
write.table(res, fn, row.names=FALSE, sep="\t")
## Done with sample LRT ##
###########################################################################
###########################################################################
## Perform replicate LRT ##
## Done with replicate LRT ##
###########################################################################
###########################################################################
## Perform data-series LRT ##
if (length(unique(df.design$dataseries)) > 1){
if (batch.mode){
colData = df.design[, c("sample.id", "dataseries","replicate")]
rownames(colData) = as.vector(df.design$sample.id)
colData$sample.id <- NULL
colnames(colData)[1] = "condition"
colData$condition <- as.factor(colData$condition)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.filt,
colData = colData,
design = ~ replicate + condition
)
## This will only take the condition parameter into account ##
## This will answer to the question if any samples are different ##
dds <- DESeq(
dds,
test = "LRT",
parallel = parallel,
reduced = ~ replicate
)
} else {
colData = df.design[, c("sample.id", "dataseries")]
rownames(colData) = as.vector(df.design$sample.id)
colData$sample.id <- NULL
colnames(colData)[1] = "condition"
colData$condition <- as.factor(colData$condition)
## This will now determine if the difference is explained by
## condition
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.filt,
colData = colData,
design = ~ condition
)
## This will only take the condition parameter into account ##
## This will answer to the question if any samples are different ##
dds <- DESeq(
dds,
test = "LRT",
parallel = parallel,
reduced = ~1
)
}
#Evaluate results
res <- results(dds)
#Continue with the differential gene expression analysis
res = data.frame(res)
res[[gene.id]] = rownames(res)
names(res) = gsub("log2FoldChange", "LRT_logFC", names(res))
res$LRT_logFC <- NULL
names(res) = gsub(
"padj",
"contrast_D_padj_LRTdataseries",
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
fn = paste("DESeq2.results.LRTdataseries.txt", sep="")
#Select columns to carry forward
col.vector = gene.id
col.vector = append(
col.vector,
names(res)[grep("LRT", names(res))]
)
res = res[,col.vector]
write.table(res, fn, row.names=FALSE, sep="\t")
}
## Done performing data-series_LRT
###########################################################################
###########################################################################
## Perform custon LRT (dfDesign "LRT_columns") ##
LRTcols <- names(df.design)[grep("^LRT_", names(df.design))]
if (length(LRTcols) > 0){
for (k in 1:length(LRTcols)){
if (batch.mode){
colData = df.design[, c("sample.id", LRTcols[k],"replicate")]
rownames(colData) = as.vector(df.design$sample.id)
colData$sample.id <- NULL
colnames(colData)[1] = "condition"
colData$condition <- as.factor(colData$condition)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.filt,
colData = colData,
design = ~ replicate + condition
)
## This will only take the condition parameter into account ##
## This will answer to the question if any samples are different ##
dds <- DESeq(
dds,
test = "LRT",
parallel = parallel,
reduced = ~ replicate
)
} else {
colData = df.design[, c("sample.id", LRTcols[k])]
rownames(colData) = as.vector(df.design$sample.id)
colData$sample.id <- NULL
colnames(colData)[1] = "condition"
colData$condition <- as.factor(colData$condition)
## This will now determine if the difference is explained by
## condition
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.filt,
colData = colData,
design = ~ condition
)
## This will only take the condition parameter into account ##
## This will answer to the question if any samples are different ##
dds <- DESeq(
dds,
test = "LRT",
parallel = parallel,
reduced = ~1
)
}
#Evaluate results
res <- results(dds)
#Continue with the differential gene expression analysis
res = data.frame(res)
res[[gene.id]] = rownames(res)
names(res) = gsub("log2FoldChange", "LRT_logFC", names(res))
res$LRT_logFC <- NULL
res$stat <- NULL
names(res) = gsub(
"padj",
paste0("contrast_L_padj_", LRTcols[k]),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
fn = paste("DESeq2.results.", LRTcols[k],".txt", sep="")
#Select columns to carry forward
col.vector = gene.id
col.vector = append(
col.vector,
names(res)[grep("LRT", names(res))]
)
res = res[,col.vector]
write.table(res, fn, row.names=FALSE, sep="\t")
}
}
## Done with sample LRT ##
###########################################################################
###########################################################################
## Perform timecourse LRT ##
if (timeseries){
if (batch.mode){
colData = df.design[, c("data.series","sample.id", "sample.group","replicate", "timepoint")]
rownames(colData) = as.vector(df.design$sample.id)
colData$sample.id <- NULL
colnames(colData)[1] = "condition"
colData$condition <- as.factor(colData$condition)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.filt,
colData = colData,
design = ~ replicate + condition
)
} else {
colData = df.design[, c("sample.id", "sample.group", "dataseries","timepoint")]
rownames(colData) = as.vector(df.design$sample.id)
colData$sample.id <- NULL
colData$dataseries <- as.factor(colData$dataseries)
colData$timepoint <- as.factor(colData$timepoint)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.filt,
colData = colData,
design = ~ dataseries + timepoint +dataseries:timepoint
)
}
## This will only take the condition parameter into account ##
dds <- DESeq(
dds,
test = "LRT",
parallel = parallel,
reduced = ~ dataseries + timepoint
)
#Evaluate results
res <- results(dds)
#Continue with the differential gene expression analysis
res = data.frame(res)
res[[gene.id]] = rownames(res)
names(res) = gsub("log2FoldChange", "contrast_0_logFC_timecourseLRT", names(res))
names(res) = gsub(
"padj",
"contrast_0_padj_timecourseLRT",
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
fn = paste("DESeq2.results.timecourseLRT.txt", sep="")
#Select columns to carry forward
col.vector = gene.id
col.vector = append(
col.vector,
names(res)[grep("LRT", names(res))]
)
res = res[,col.vector]
write.table(res, fn, row.names=FALSE, sep="\t")
}
## Done timecourse LRT ##
###########################################################################
tempShellScriptVector <- c(
tempShellScriptVector,
'\n',
'################################################################################',
'\n',
'## R-commands used for differential gene expression analysis: ##',
'\n'
)
#Create individual comparisons after subsetting the raw counts table
######################
## Begin for loop ##
for (i in 1:length(comparisons)){
#Create col data data frame
if (batch.mode){
colData = df.design[, c("sample.id", "sample.group","replicate")]
rownames(colData) = as.vector(df.design$sample.id)
colnames(colData)[1] = "condition"
#colnames(colData)[2] = "replicate"
colData[,1] = as.factor(df.design[,comparisons[i]])
} else {
colData = df.design[, c("sample.id", "sample.group")]
rownames(colData) = as.vector(df.design$sample.id)
colnames(colData)[1] = "condition"
colData[,1] = as.factor(df.design[,comparisons[i]])
colData$sample.group <- as.factor(colData$sample.group)
}
#Remove superflous rows from colData
colData = droplevels(data.frame(colData[colData$condition != "",]))
#colData$sample.group = NULL
#Bring coldata rows in the same order as raw.counts.temp columns
#colData = data.frame(colData[colnames(raw.counts.temp),])
#colnames(colData)[1] = "condition"
## Order colData according to conditon
## e.g. 1_, 2_ if 1_ and 2_ prefix is set in condx column##
colData <- colData[order(colData$condition),]
## Extract order for col names ##
contrasts = sort(unique(df.design[,comparisons[i]]), decreasing = FALSE)
contrasts = contrasts[contrasts != ""]
## Remove order suffix
colData$condition <- gsub("^1_", "", colData$condition)
colData$condition <- gsub("^2_", "", colData$condition)
contrasts <- gsub("^1_", "", contrasts)
contrasts <- gsub("^2_", "", contrasts)
#Create contrast vector
#contrast.vector = c([condition],[1_diff.gene set, e.g. mt],[2_baseline, e.g. wt])
#if (contrasts[2] != "scr"){
# contrasts = rev(contrasts)
#}
sel.col = contrasts
contrast.vector = append("condition", contrasts)
colName = paste(contrasts, collapse = "_vs_")
raw.counts.temp = raw.counts.filt[,rownames(colData)]
## Make factor ##
colData$condition <- as.factor(colData$condition)
colData$sample.group <- as.factor(colData$sample.group)
if (batch.mode){
colData$replicate <- as.factor(colData$replicate)
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.temp,
colData = colData,
design = ~ replicate + condition
)
## Documentation ##
colDataVec <- as.vector(NULL, mode = "character")
colDataVec <- c(
colDataVec,
'\n',
paste0(c("sample.id",colnames(colData), collapse = "\t")),
'\n'
)
for (m in 1:nrow(colData)){
colDataVec <- c(
colDataVec,
'\n',
paste0(c(row.names(colData)[m],as.vector(t(colData[m,]))), collapse = "\t"),
'\n'
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
'\n',
'################################################################################',
'\n',
paste0("## DGE comparison ", colName, " ##"),
'\n',
"dds <- DESeqDataSetFromMatrix(",
'\n',
" countData = raw.counts.temp",
'\n',
" colData = colData",
'\n',
" design = ~ replicate + condition",
'\n',
")",
'\n',
'\n',
"DGE Input Matrix",
colDataVec,
'\n',
'dds <- DESeq(dds)',
'\n',
'res <- results(dds, contrast = contrast.vector)',
'\n'
)
} else {
dds <- DESeqDataSetFromMatrix(
countData = raw.counts.temp,
colData = colData,
design = ~ condition
)
## Documentation ##
colDataVec <- as.vector(NULL, mode = "character")
colDataVec <- c(
colDataVec,
'\n',
paste0(c("sample.id",colnames(colData), collapse = "\t")),
'\n'
)
for (m in 1:nrow(colData)){
colDataVec <- c(
colDataVec,
paste0(c(row.names(colData)[m],as.vector(t(colData[m,]))), collapse = "\t"),
'\n'
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
'\n',
paste0("DGE comparison ", comparisons[i]),
'\n',
"dds <- DESeqDataSetFromMatrix(",
'\n',
" countData = raw.counts.temp",
'\n',
" colData = colData",
'\n',
" design = ~ replicate + condition",
'\n',
")",
'\n',
'\n',
"DGE Input Matrix:",
'\n',
colDataVec,
'\n',
'dds <- DESeq(dds)',
'\n',
'res <- results(dds, contrast = contrast.vector)',
'\n'
)
}
#dds$condition <- factor(dds$condition, levels=contrasts)
dds <- DESeq(dds)
res <- results(dds, contrast = contrast.vector)
#Create MA plot
fn = paste(plotOutputDir, "MA.plot.", comparisons[i], ".png", sep="")
if (writePlotsToFile){
png(fn, type="cairo")
print(plotMA(res, main="DESeq2", ylim=c(-2,2)))
} else {
plotMA(res, main="DESeq2", ylim=c(-2,2))
}
if (writePlotsToFile){
dev.off()
}
#Identify most variable genes in the dataset
#Use sd(row)/mean(row)
#Create PCA plot based on the most variable genes in the dataset
if (doPCA){
if (length(unique(obj@dfDesign$sample.id)) > 42) {
rld <- vst(obj@ObjDds)
} else {
rld <- rlog(obj@ObjDds)
}
fn = paste(plotOutputDir,"PCA_plot.", comparisons[i], ".png", sep="")
png(fn, type="cairo")
print(plotPCA(rld, intgroup=c("condition")))
dev.off()
}
#######################################################################
#Continue with the differential gene expression analysis
res = data.frame(res)
names(res) = paste(names(res), colName, sep="_")
res[[gene.id]] = rownames(res)
names(res) = gsub("log2FoldChange", "logFC", names(res))
names(res) = gsub(
"logFC",
paste("contrast_", i, "_logFC", sep=""),
names(res)
)
names(res) = gsub(
"padj",
paste("contrast_", i, "_padj", sep=""),
names(res)
)
names(res) = gsub(
"stat",
paste("contrast_", i, "_stat", sep=""),
names(res)
)
#Remove all rows without a padj
padj.col = grep("padj", names(res))[1]
res[,padj.col][is.na(res[,padj.col])] = ""
res = res[res[,padj.col] != "", ]
fn = paste("DESeq2.results.", comparisons[i], ".txt", sep="")
#Select columns to carry forward
col.vector = gene.id
col.vector = append(
col.vector,
names(res)[grep("contrast", names(res))]
)
res = res[,col.vector]
write.table(res, fn, row.names=FALSE, sep="\t")
}
#########################
## End for loop ####
#########################
###############################################################################
#Preparing a single result table for all comparisons #
###############################################################################
setwd(DEseq2Dir)
## Add LRT ##
if (length(unique(df.design$dataseries)) > 1){
comparisons <- c(
comparisons,
"LRTsampleGroup",
"LRTdataseries",
names(df.design)[grep("LRT_", names(df.design))]
)
} else {
comparisons <- c(
comparisons,
"LRTsampleGroup",
names(df.design)[grep("LRT_", names(df.design))]
)
}
## Add timecourse LRT ##
if (timeseries){
comparisons <- c(
comparisons,
"timecourseLRT"
)
}
for (i in 1:length(comparisons)){
fn = paste("DESeq2.results.", comparisons[i], ".txt", sep="")
df.temp = read.delim(fn, header=TRUE, sep="\t")
if (i > 1){
df.summary = merge(
df.summary,
df.temp,
by.x = gene.id,
by.y = gene.id,
all=TRUE
)
} else {
df.summary = df.temp
}
}
# Set all na p values in df.summary to 1
p.val <- names(df.summary)[grep("padj", names(df.summary))]
for (k in 1:length(p.val)){
df.summary[is.na(df.summary[,p.val[k]]),p.val[k]] = 1
}
# Set all others to "
df.summary[is.na(df.summary)] = ""
#Prepare norm counts section
names(df.normCounts) = ifelse(
substr(
names(df.normCounts),
1,
1
) == "X",
substr(names(df.normCounts), 2, 1000),
names(df.normCounts)
)
names(df.normCounts) <- gsub("[.]", "_", names(df.normCounts))
names(df.normCounts) <- paste("norm_counts", names(df.normCounts), sep="_")
df.normCounts[[gene.id]] = row.names(df.normCounts)
#Remove all rows with 0 counts ##
df.summary = merge(
df.summary,
df.normCounts,
by.x=gene.id,
by.y=gene.id
)
df.summary[is.na(df.summary)] = ""
## Cleaning up unloading package and namespace to not interfere with
# Rmysql
detach("package:DESeq2", unload = TRUE)
unloadNamespace("DESeq2")
## Remove RSQLite DDI drivers ##
unloadNamespace("genefilter")
unloadNamespace("biomaRt")
unloadNamespace("geneplotter")
unloadNamespace("annotate")
unloadNamespace("AnnotationDbi")
unloadNamespace("RSQLite")
returnList <- list(
"df.summary" = df.summary,
"docuVector" = tempShellScriptVector
)
return(returnList)
}
## End of function ##
###############################################################################
## Done differential gene expression ##
###############################################################################
###############################################################################
## Calculate Coefficient of variation ##
## Assemble database table ##
# counts matrix
# DGE table
# LRT table
# RSEM counts
# Annotation
## Done calculating the coefficient of variation ##
###############################################################################
## Done initializing S4 object ##
###############################################################################
###############################################################################
## (0a) createConcatenateFASTQfilesShellScript ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createConcatenateFASTQfilesShellScript <- function(
dfConCat = "dataframe with SRR inut and SRX output file names"
){
## Determine need to concatenate ##
srxVec <- as.vector(unique(dfConCat$srxFASTQname))
srrVec <- as.vector(unique(dfConCat$srrFASTQname))
SRXvec <- as.vector(
unique(
dfConCat$srxFASTQname
)
)
## Create shell script ##
scriptVec <- as.vector(NULL, mode = "character")
scriptVec <- c(
scriptVec,
"#!/bin/sh",
"\n"
)
for (i in 1:length(SRXvec)){
catFiles <- as.vector(
dfConCat[dfConCat$srxFASTQname == SRXvec[i], "srrFASTQname"]
)
if (length(catFiles) > 1){
catCMD <- paste0(
"cat ",
paste(catFiles, collapse = " "),
" > ",
SRXvec[i]
)
} else {
catCMD <- paste0(
"mv ",
catFiles[1],
" ",
SRXvec[i]
)
}
scriptVec <- c(
scriptVec,
catCMD,
"\n"
)
}
sink("concatFASTQfiles.sh")
for (i in 1:length(scriptVec)){
cat(scriptVec[i])
}
sink()
return(scriptVec)
}
## End: createConcatenateFASTQfilesShellScript ##
###############################################################################
###############################################################################
## (0b) organizeFastqFiles ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
## Function parameters ##
organizeFastqFiles <- function(
baseMount = 'gsub("boeings/", "", hpc.mount)',
pathToSeqStorageFolder = 'pathToSeqStorageFolder',
fastqOutputDir = 'fastqDir',
localWorkDir = 'localworkdir',
checkFileExists = TRUE
){
SRR <- unlist(sapply(
pathToSeqStorageFolder,
function(x) paste0(
x, list.files(x)
)
))
if (checkFileExists){
SRR <- unique(SRR[file.exists(SRR)])
} else {
SRR <- as.vector(unique(SRR))
}
original.NGS <- SRR
for (i in 1:length(pathToSeqStorageFolder)){
original.NGS <- gsub(pathToSeqStorageFolder[i], "", original.NGS)
}
original.NGS <- as.vector(
unique(
original.NGS
)
)
dfConCat <- data.frame(
SRR,
original.NGS,
stringsAsFactors = FALSE
)
dfConCat[["R"]] <- ""
dfConCat[grep("_R1_", as.vector(dfConCat$SRR)), "R"] <- "R1"
dfConCat[grep("_R2_", as.vector(dfConCat$SRR)), "R"] <- "R2"
dfConCat[["SRX"]] <- ""
dfConCat$SRX <- sapply(
as.vector(dfConCat$original.NGS),
function(x)
unlist(
strsplit(x, "_")
)[1]
)
dfConCat$SRX <- paste0(
fastqOutputDir,
dfConCat$SRX,
"_",
dfConCat$R,
".fastq.gz"
)
srrFASTQname <- dfConCat$SRR
srxFASTQname <- dfConCat$SRX
dfConCat <- unique(data.frame(srrFASTQname, srxFASTQname))
## If necessary, create concatenation script ##
shellScriptVector <- as.vector(NULL, mode = "character")
if (length(unique(dfConCat$srrFASTQname)) > length(unique(dfConCat$srxFASTQname))){
setwd(localWorkDir)
tempShellScriptVector <- createConcatenateFASTQfilesShellScript(
dfConCat = dfConCat
#paired.end = FALSE
)
shellScriptVector <- c(
shellScriptVector,
tempShellScriptVector
)
print("Remove end of line characters: tr -d '\r' <concatFASTQfiles.sh> conv.concatFASTQfiles.sh")
concatenationRequired <- TRUE
} else {
print("One file per sample - no cocatenation required.")
concatenationRequired <- FALSE
}
if (concatenationRequired){
print("Concatenation is required")
} else {
shellScriptVector <- "No concatenation required"
}
returnList <- list(
"shellScriptVector" = shellScriptVector,
"concatenationRequired" = concatenationRequired
)
return(returnList)
}
## (0b) organizeFastqFiles ##
###############################################################################
###############################################################################
## (40) completeDesignBasedOnSampleID() ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
## helper function ##
completeDesignBasedOnSampleID <- function(
dfBasedesign,
fastqDir = ""
){
## Create sample group ##
dfBasedesign[["sample.group"]] <- as.vector(
sapply(
as.vector(dfBasedesign$sample.id),
function(x) paste(
unlist(
strsplit(x, "_")
)[c(1:2)],
collapse = "_"
)
)
)
## Create dataseries ##
dfBasedesign[["dataseries"]] <- as.vector(sapply(
dfBasedesign$sample.group,
function(x) unlist(
strsplit(x, "_")
)[1]
))
## Create dataseries color ##
library(RColorBrewer)
nCol <- length(unique(dfBasedesign$dataseries))
selcol <- colorRampPalette(brewer.pal(9,"YlOrBr"))
groupCols <- selcol(nCol)
dfColor <- data.frame(unique(dfBasedesign$dataseries), groupCols)
names(dfColor) <- c("dataseries", "dataseries_color")
dfDesign <- merge(dfBasedesign, dfColor, by.x = "dataseries", by.y = "dataseries")
## Set original.NGS ##
names(dfDesign) <- gsub("srxFASTQname", "original.NGS", names(dfDesign))
## Set NGS ##
dfDesign[["NGS"]] <- paste0(
fastqDir,
dfDesign$sample.id,
"_R1.fastq.gz"
)
## Ensure R1/R2 consistency ##
if (length(grep("original.NGS", names(dfDesign))) > 0){
pos1 <- grep("R2.fastq.gz",dfDesign$original.NGS)
if (length(pos1) > 0){
dfDesign[pos1, "NGS"] <- gsub(
"R1.fastq.gz",
"R2.fastq.gz",
dfDesign[pos1, "NGS"]
)
}
}
return(dfDesign)
}
## End completeDesignBasedOnSampleID() ##
###############################################################################
###############################################################################
## (0c) createDesignFileCrickASFsamples ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createDesignFileCrickASFsamples <- function(
pathToSeqStorageFolder = 'pathToSeqStorageFolder',
FNsampleAnnotation = 'paste0(
S4LvariableListing$localWorkDir,
"sample.ids.txt")',
paired.end = 'paired.end',
fastqDir = 'fastqDir',
baseMount = ''
){
fastqBasefolders <- unlist(pathToSeqStorageFolder)
original.NGS <- as.vector(NULL, mode = "character")
original.NGS.FN <- as.vector(NULL, mode = "character")
for (i in 1:length(fastqBasefolders)){
original.NGS <- c(
original.NGS,
paste0(
fastqBasefolders[i],
list.files(
fastqBasefolders[i]
)
)
)
original.NGS.FN <- c(
original.NGS.FN,
paste0(
list.files(
fastqBasefolders[i]
)
)
)
}
df.fastq <- data.frame(
original.NGS,
original.NGS.FN,
stringsAsFactors = FALSE
)
df.fastq[["sampleID"]] <- sapply(
df.fastq$original.NGS.FN,
function(x)
unlist(
strsplit(
x, "_"
)
)[1]
)
df.sample.ids <- read.delim(
file = FNsampleAnnotation,
header = TRUE,
stringsAsFactors = FALSE
)
df.sample.ids <- unique(
merge(
df.sample.ids,
df.fastq,
by.x = "sampleID",
by.y = "sampleID"
)
)
dfDesign <- df.sample.ids
## sample ids ##
dfDesign <- completeDesignBasedOnSampleID(
dfBasedesign = dfDesign,
fastqDir = fastqDir
)
## Set NGS ##
dfDesign[["NGS"]] <- ""
dfDesign[grep("_R1.", dfDesign$original.NGS),"NGS"] <-
paste0(
fastqDir,
dfDesign[grep("_R1.", dfDesign$original.NGS),"sample.id"],
"_R1.fastq.gz"
)
if (paired.end){
dfDesign[grep("_R2.", dfDesign$original.NGS),"NGS"] <-
paste0(
fastqDir,
dfDesign[grep("_R2.", dfDesign$original.NGS),"sample.id"],
"_R2.fastq.gz"
)
}
## Set default dataseries colors ##
library(RColorBrewer)
selcol <- colorRampPalette(brewer.pal(9,"Set1"))
sample.groups <- unique(dfDesign$sample.group)
group.cols <- selcol(length(sample.groups))
dfDesign$dataseries_color <- "green"
for (i in 1:length(sample.groups)){
dfDesign[dfDesign$sample.group == sample.groups[i], "dataseries_color"] = group.cols[i]
}
return(dfDesign)
}
## End: 0c createDesignFileCrickASFsamples ##
###############################################################################
###############################################################################
## (0d) addDGEcomparisons2DesignFile ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
addDGEcomparisons2DesignFile <- function(
dfDesign = 'dfDesign',
comparisonList = 'list(
"WT_vs_MT" = c("MT_YapTaz", "WT_WT")
)'
){
for (i in 1:length(comparisonList)){
comparison <- paste0("comp_", i)
dfDesign[[comparison]] <- ""
dfDesign[grep(comparisonList[[i]][1], dfDesign$sample.group), comparison] <-
paste0("1_",comparisonList[[i]][1])
dfDesign[grep(comparisonList[[i]][2], dfDesign$sample.group), comparison] <-
paste0("2_",comparisonList[[i]][2])
}
dfDesign <- dfDesign[order(
#dfDesign$dataseries,
dfDesign$sample.group,
dfDesign$sample.id),
]
return(dfDesign)
}
## End (0d) ##
###############################################################################
###############################################################################
## (00) hpcClusterSubmisison ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
hpcClusterSubmisison <- function(
CMDstring = "ls",
jobname = "SBhpcRun",
cores = 1,
memPerCpu = 7000,
logFN = "report.file.slurm",
commands.text.file = "commands.txt"
){
hpcString <- c(
'sbatch --time=12:00:00 --wrap "',
CMDstring,
'" --job-name=',
jobname,
' -c ',
cores,
' --mem-per-cpu=',
memPerCpu,
' -o ',
logFN,
' >> ',
commands.text.file
)
hpcString <- paste(hpcString, collapse = "")
return(hpcString)
}
## Done cluster submission ##
###############################################################################
setGeneric(
name="hpcClusterSubmission",
def=function(obj){
}
)
###############################################################################
# (4) create.trim.galore.shell.script #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="createTrimGaloreShellScript",
def=function(
obj,
scriptVecSlot = "scriptVec"
){
tempShellScriptVector <- as.vector(NULL, mode = "character")
## Parameters ##
FASTQ = unique(obj@dfDesign$sample.id)
obj@dfDesign$FASTQ <- obj@dfDesign$sample.id
logdir = gsub("workdir", "logs", obj@parameterList$workdir)
## Create core script ##
tempShellScriptVector <- c(
tempShellScriptVector,
"#!/bin/sh",
"\n",
"###############################################################################",
"\n",
"## Adapter- and Quality triming using trimgalore ##",
"\n",
"## Parameters: ##",
"\n",
paste0("## Minimum read length: ", obj@parameterList$TrimGaloreMinLength, "bp"),
"\n",
paste0("## Quality score cut-off: ", obj@parameterList$TrimGaloreMinQuality),
"\n",
"\n",
paste0("if [ ! -d ",logdir," ]; then"),
"\n",
paste0(" mkdir ", logdir),
"\n",
"fi",
"\n",
"\n",
paste0(
"project=",
obj@parameterList$project_id
),
"\n",
"\n",
"#################################################################################",
"\n",
"##FUNCTIONS######################################################################",
"\n",
"#################################################################################",
"\n",
"\n",
"wait_on_lsf() { ## wait on jobs{",
"\n",
"sleep 300",
"\n",
"n=`squeue --name=$project | wc -l`",
"\n",
#cat('n=n-1'); cat('\n');
"while [ $n -ne 1 ]",
"\n",
"do",
"\n",
"n=`squeue --name=$project | wc -l`",
"\n",
#cat('n=n-1'); cat('\n');
"((z=$n-1))",
"\n",
"#number of running",
"\n",
"echo \"$project jobs running: $z\"",
"\n",
"#number of pending",
"\n",
"sleep 300",
"\n",
"done",
"\n",
"}",
"\n",
"\n",
"\n",
"\n",
paste0("module load ",obj@parameterList$ModuleTrimGalore),
"\n",
"\n"
)
## Removing this -a AGATCGGAAGAGC should enable outodetection of adaptor
if (obj@parameterList$paired.end){
cmd.part1 = paste0("trim_galore -q ",obj@parameterList$TrimGaloreMinQuality," --paired --length ",obj@parameterList$TrimGaloreMinLength," -o ", obj@parameterList$fastqDir, " ")
for (i in 1:length(FASTQ)){
r1 = paste0(obj@parameterList$fastqDir, FASTQ[i], "_R1.fastq.gz")
r2 = paste0(obj@parameterList$fastqDir, FASTQ[i], "_R2.fastq.gz")
cmd = paste0(cmd.part1, r1, " ", r2)
cmd1 = paste0(
"sbatch --time=12:00:00 --wrap '",
cmd,
"' --job-name=",
obj@parameterList$project_id ,
" -c 1 --mem-per-cpu=7000 -o ",
logdir,
FASTQ[i],
".cutadapt.slurm >> commands.txt"
)
cmd2 = paste0(
"echo '",
cmd,
"' >> commands.txt"
)
tempShellScriptVector <- c(
tempShellScriptVector,
"## Running in paired-end mode",
"\n",
cmd1,
"\n",
cmd2,
"\n",
"echo '###################################################' >> commands.txt",
"\n",
"\n"
)
}
} else {
cmd.part1 = paste0("trim_galore -q ",obj@parameterList$TrimGaloreMinQuality," --length ",obj@parameterList$TrimGaloreMinLength," -o ", obj@parameterList$fastqDir, " ")
for (i in 1:length(FASTQ)){
r1 = paste0(
obj@parameterList$fastqDir,
FASTQ[i],
"_R1.fastq.gz"
)
cmd = paste0(
cmd.part1,
r1
)
cmd1 = paste0(
"sbatch --time=12:00:00 --wrap '",
cmd,
"' --job-name=",
obj@parameterList$project_id ,
" -c 1 --mem-per-cpu=7000 -o ",
logdir,
FASTQ[i],
".cutadapt.slurm >> commands.txt"
)
cmd2 = paste0(
"echo '",
cmd,
"' >> commands.txt"
)
tempShellScriptVector <- c(
tempShellScriptVector,
"## Running in single-end mode",
"\n",
cmd1,
"\n",
cmd2,
"\n",
"\n",
"echo '###################################################' >> commands.txt",
"\n",
"\n"
)
}
}
## Wait until all files are done ##
tempShellScriptVector <- c(
tempShellScriptVector,
'wait_on_lsf',
'\n',
'\n'
)
## Rename output files ##
for (i in 1:length(FASTQ)){
if (obj@parameterList$paired.end){
r1 = paste0(obj@parameterList$fastqDir, FASTQ[i], "_trimgalore_R1.fastq.gz")
r2 = paste0(obj@parameterList$fastqDir, FASTQ[i], "_trimgalore_R2.fastq.gz")
tg.r1 = gsub(
"_R1.fastq.gz",
"_R1_val_1.fq.gz",
paste0(
obj@parameterList$fastqDir,
FASTQ[i],
"_R1.fastq.gz"
)
)
tg.r2 = gsub(
"_R2.fastq.gz",
"_R2_val_2.fq.gz",
paste0(
obj@parameterList$fastqDir,
FASTQ[i],
"_R2.fastq.gz"
)
)
cmd1 = paste0(
"mv ",
tg.r1,
" ",
r1
)
cmd2 = paste0(
"mv ",
tg.r2,
" ",
r2
)
tempShellScriptVector <- c(
tempShellScriptVector,
cmd1,
'\n',
cmd2,
'\n'
)
} else {
r1 = paste0(
obj@parameterList$fastqDir,
FASTQ[i],
"_trimgalore_R1.fastq.gz"
)
tg.r1 = gsub(
"_R1.fastq.gz",
"_R1_trimmed.fq.gz",
paste0(
obj@parameterList$fastqDir,
FASTQ[i],
"_R1.fastq.gz"
)
)
cmd1 = paste0(
"mv ",
tg.r1,
" ",
r1
)
tempShellScriptVector <- c(
tempShellScriptVector,
cmd1,
"\n"
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
"echo '###################################################' >> commands.txt",
"\n",
"###############################################################################",
"\n"
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
"\n",
"\n",
"wait_on_lsf",
"\n",
"\n",
"## End of adaptor- and quality triming script ##",
"\n",
"###############################################################################",
"\n",
"\n",
"\n",
"\n"
)
## Write shell script to file ##
#setwd(localWorkDir)
# sink(paste0(project.code, ".trimgalore.script.sh"))
#for (i in 1:length(tempShellScriptVector)){
# cat(tempShellScriptVector[i])
#}
#sink()
#If created on a windows machine, don't forget to
#tr -d '\r' <X.cutadapt.script.sh> conv.x.cutadapt.script.sh
## Edit 20180305 ##
#df.design[["FASTQ_trimgalore"]] = paste0(df.design$FASTQ, "_trimgalore")
obj@dfDesign[["FASTQ_trimgalore"]] = paste0(obj@dfDesign$sample.id, "_trimgalore")
result.list <- list(
"obj" = obj,
"ShellScriptVector" = tempShellScriptVector
)
#return(result.list)
obj <- add2vec(
obj = obj,
slot_name = scriptVecSlot,
value = tempShellScriptVector
)
return(obj)
}
)
## End of function ##
###############################################################################
###############################################################################
# (6)create.alignment.shell.script #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="createAlignmentShellScript",
def=function(
obj,
scriptVecSlot = "scriptVec"){
###############################################################################
#Create shell script for FASTQC and RSEM #
###############################################################################
#FASTCQ commands and parameters
#load.fastqc.module <- "module load FastQC/0.11.5-Java-1.8.0_92"
fastqc.exe <- "fastqc"
FASTQC_option1 <- "--threads 8 "
if (obj@parameterList$stranded) {
obj@parameterList$forward_prob = "--forward-prob 0.0"
} else {
obj@parameterList$forward_prob = "--forward-prob 0.5"
}
tempShellScriptVector <- as.vector(NULL, mode = "character")
RSEMdir <- paste(
obj@parameterList$workdir,
"RSEM",
sep=""
)
#Alignment to ensemble to pick up non-coding RNAs
#refSeq.index = "/farm/home/patel35/GENOME/hg19/index/RSEM/refseq/19-02-15/hg19"
FASTQ <- unique(obj@dfDesign[,obj@parameterList$AlignFASTQcolumn])
sample.id <- unique(obj@dfDesign$sample.id)
#Create design file
#Start making a shell script that dispatches all samples for FASTQC and RSEM alignment
setwd(obj@parameterList$localWorkDir)
## Begin creation of shell script vector ##
tempShellScriptVector <-c(
'#!/bin/sh',
'\n',
'\n',
'###############################################################################',
'\n',
'## STAR Alignment to reference transcriptome ##',
'\n',
'\n',
'###############################################################################',
'\n',
'# Create required folders #',
'\n',
'###############################################################################',
'\n',
'\n',
paste0("if [ ! -d ", RSEMdir," ]; then"),
'\n',
paste0(" mkdir -p ", RSEMdir),
'\n',
'fi',
'\n',
'\n',
paste0("if [ ! -d ", obj@parameterList$logDir," ]; then"),
'\n',
paste0(" mkdir -p ", obj@parameterList$logDir),
'\n',
'fi',
'\n',
'\n',
paste0("if [ ! -d ", obj@parameterList$FASTQCdir," ]; then"),
'\n',
paste0(" mkdir -p ", obj@parameterList$FASTQCdir),
'\n',
'fi',
'\n',
'\n',
paste0("if [ ! -d ", obj@parameterList$AlignOutputEnsDir," ]; then"),
'\n',
paste0(" mkdir -p ", obj@parameterList$AlignOutputEnsDir),
'\n',
'fi',
'\n',
'\n',
'\n'
)
if (obj@parameterList$paired.end){
tempShellScriptVector <-c(
tempShellScriptVector,
"## Mode: Paired-end",
'\n',
'\n'
)
} else {
tempShellScriptVector <-c(
tempShellScriptVector,
"## Mode: Single-end",
'\n',
'\n'
)
}
for (i in 1:length(FASTQ)){
sample.label <- sample.id[i]
## QC using FASTQC
if (obj@parameterList$paired.end){
FQ.R1 = paste(FASTQ[i], "_R1.fastq.gz", sep="")
FQ.R2 = paste(FASTQ[i], "_R2.fastq.gz", sep="")
FASTQC.CMD.R1 <- paste(
fastqc.exe,
" --outdir ",
obj@parameterList$FASTQCdir," ",
FASTQC_option1,
obj@parameterList$fastqDir,
FQ.R1,
sep=''
)
FASTQC.CMD.R2 <- paste(
fastqc.exe,
" --outdir ",
obj@parameterList$FASTQCdir," ",
FASTQC_option1,
obj@parameterList$fastqDir,
FQ.R2,
sep=''
)
FASTQC.CMD.R1 <- paste(
'sbatch --time=12:00:00 --wrap "',
FASTQC.CMD.R1,
'" --job-name=',
obj@parameterList$project_id,
' -c 1 --mem-per-cpu=7000 -o ',
obj@parameterList$logDir,
"/",
FASTQ[i],
'_R1.fastqc.slurm >> commands.txt',
sep=''
)
FASTQC.CMD.R2 = paste(
'sbatch --time=12:00:00 --wrap "',
FASTQC.CMD.R2,
'" --job-name=',
obj@parameterList$project_id,
' -c 1 --mem-per-cpu=7000 -o ',
obj@parameterList$logDir, "/", FASTQ[i], '_R2.fastqc.slurm >> commands.txt',
sep=''
)
} else {
FQ.R = paste(FASTQ[i], "_R1.fastq.gz", sep="")
FASTQC.CMD.R <- paste(
fastqc.exe,
" --outdir ",
obj@parameterList$FASTQCdir,
" ",
FASTQC_option1,
obj@parameterList$fastqDir,
FQ.R,
sep=''
)
FASTQC.CMD.R = paste(
'sbatch --time=12:00:00 --wrap "',
FASTQC.CMD.R,
'" --job-name=',
obj@parameterList$project_id,
' -c 1 --mem-per-cpu=7000 -o ',
obj@parameterList$logDir,
sample.id[i],
'.fastqc.slurm >> commands.txt',
sep=''
)
}
## Prepare RSEM command ##
## Build RSEM base command ##
temp <- paste(
"--temporary-folder ",
obj@parameterList$workdir,
"RSEM/Ensembl/",
sep=""
)
temp.folder <- paste(
temp,
sample.label,
"/temp/",
sep=""
)
RSEM.exe <- paste0(
"rsem-calculate-expression ",
temp.folder, " ",
"--star ",
"--num-threads 6 ",
"--calc-ci", " ",
"--ci-memory 10240 ",
"--estimate-rspd ",
"--seed 1 ",
"--star-output-genome-bam ",
"--star-gzipped-read-file ",
obj@parameterList$forward_prob
)
mkdir <- paste(
obj@parameterList$AlignSampleDir,
"/", sample.label,
sep=""
)
if (obj@parameterList$paired.end){
input.fastq.files = paste0(
obj@parameterList$AlignForwardProb," ",
"--paired-end ", obj@parameterList$fastqDir, FQ.R1, " ", obj@parameterList$fastqDir, FQ.R2, " "
)
} else {
input.fastq.files = paste0(
obj@parameterList$AlignForwardProb," ",
obj@parameterList$fastqDir, FQ.R, " ")
}
RSEM.CMD <- paste(
RSEM.exe,
input.fastq.files,
obj@parameterList$genomeIndex ," ",
obj@parameterList$AlignOutputEnsDir, sample.label, " > ", obj@parameterList$logDir,
sample.label, ".log", " ", "2>&1", sep=""
)
RSEM.CMD = paste(
'sbatch --time=12:00:00 --wrap "',
RSEM.CMD,
'" --job-name=',
obj@parameterList$project_id,
' -c 12 --mem-per-cpu=7000 -o ',
obj@parameterList$logDir,
sample.id[i],
'.RSEM.slurm >> commands.txt',
sep=''
)
tempShellScriptVector <-c(
tempShellScriptVector,
'###############################################################################',
'\n',
paste(
"# Start",
sample.id[i],
"-submission #",
sep=""
),
'\n',
'###############################################################################',
'\n',
'\n',
paste0("if [ ! -d ", mkdir," ]; then"),
'\n',
paste0(" mkdir -p ", mkdir),
'\n',
'fi',
'\n',
'\n',
'\n',
'## FASTQC ##',
'\n',
obj@parameterList$ModuleFASTQC,
'\n',
'\n'
)
if (obj@parameterList$paired.end){
tempShellScriptVector <-c(
tempShellScriptVector,
FASTQC.CMD.R1,
'\n'
)
} else {
tempShellScriptVector <-c(
tempShellScriptVector,
FASTQC.CMD.R,
'\n'
)
}
tempShellScriptVector <-c(
tempShellScriptVector,
'\n',
'## RSEM-STAR ##',
'\n',
'module load Perl/5.24.0-foss-2016b',
'\n',
'module load RSEM/1.3.0-foss-2016b',
'\n',
'module load STAR/2.5.2a-foss-2016b',
'\n',
'\n',
#cat('module load Bowtie2/2.2.9-foss-2016b'); cat('\n');cat('\n');
RSEM.CMD,
'\n',
'\n'
)
} #End of shell script loop for one sample
tempShellScriptVector <- c(
tempShellScriptVector,
"## End of FASTQC and alignment script ##",
"\n",
"###############################################################################",
"\n",
"\n",
"\n",
"\n",
"##wait on jobs",
"\n",
"wait_on_lsf",
"\n",
"# Feature to be activated",
"\n",
"module load RSEM/1.2.31-foss-2016b",
"\n"
)
## Create RSEM part ##
obj@parameterList$RSEMcountDataFile <- paste0(
obj@parameterList$project_id,
".count.data.txt"
)
samples = as.vector(
unique(
obj@dfDesign$sample.id
)
)
files <- paste(
obj@parameterList$workdir,
"RSEM/Ensembl/",
samples,
".genes.results",
sep=""
)
files = paste(
files,
collapse = " "
)
RSEM.CMD = paste(
"rsem-generate-data-matrix ",
files,
" > ./RSEM/",
obj@parameterList$RSEMcountDataFile,
sep=""
)
tempShellScriptVector <- c(
tempShellScriptVector,
"## RSEM Create data matrix ##",
"\n",
RSEM.CMD,
"\n"
)
## Done creating shell script line vector ##
#sink(
# paste0(
# project.code,
# ".create.fastq.and.rsem.cmds.for.all.samples.sh"
# )
#)
#for (i in 1:length(tempShellScriptVector)){
# cat(tempShellScriptVector[i])
#}
#sink();
#returnList <- list(
# "obj" = obj,
# "shellScriptVector" = tempShellScriptVector
#)
obj <- add2vec(
obj = obj,
slot_name = scriptVecSlot,
value = tempShellScriptVector
)
return(obj)
}
)
## End of function ############################################################
###############################################################################
###############################################################################
# (5) create.rnaseqc.script #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#'
#'
create.rnaseqc.script <- function(
df.design,
sample.column = "sample.id",
project.code = "p103",
project="SB_RNAseqQC",
basedataDir="/camp/stp/babs/working/boeings/Projects/103_VTL_ES_RNA_seq_BAFF_timecourse_hs/workdir/RSEM/Ensembl",
bam.suffix = "STAR.genome.bam",
GTFfile="/camp/stp/babs/working/data/genomes/homo_sapiens/ensembl/GRCh38/release-86/gtf/Homo_sapiens.GRCh38.86.rnaseqc.gtf",
rRNAfile="/camp/stp/babs/working/data/genomes/homo_sapiens/ensembl/GRCh38/release-86/gtf/Homo_sapiens.GRCh38.86.rRNA.list",
genome.fa="/camp/stp/babs/working/data/genomes/homo_sapiens/ensembl/GRCh38/release-86/genome/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa",
refFlatFile="",
ribosomalIntervalList="",
bedFile="",
paired.end = FALSE,
strandSpecific = TRUE
){
tempShellScriptVector <- as.vector(NULL, mode = "character")
if (strandSpecific){
strandedness<- "SECOND_READ_TRANSCRIPTION_STRAND"
} else {
strandedness<- "NONE"
}
## Order samples so that sortin is > condition > sample.group > sample.id
#df.design <- df.design[order(df.design$dataseries, df.design$sample.group, df.design$sample.id),]
samples = as.vector(
unique(
df.design[,sample.column]
)
)
tempShellScriptVector <- c(
tempShellScriptVector,
'#!/bin/sh',
'\n',
'#Copy this shell script into the project directory and run it from there.',
'\n',
'#################################################################################',
'\n',
'##Create log directory ##########################################################',
'\n',
'if [ ! -d logs ]; then',
'\n',
' mkdir logs',
'\n',
'fi',
'\n',
'\n',
'#################################################################################',
'\n',
'###VARIABLES#####################################################################',
'\n',
'#################################################################################',
'\n',
paste0('project="', project, '"'),
'\n',
'projectID=""',
'\n',
'#Path to the directory with the BAM files', '\n',
'#FASTQ files have to be named [sample_name_as_given_in_samples_below]_R1.fastq.gz or [sample_name_as_given_in_samples_below]_R2.fastq.gz', '\n',
paste0('alignDir="', basedataDir, '"'), '\n',
paste0('GTFfile="', GTFfile, '"'), '\n',
#paste0('GTFfile_RNASeQC="', GTFfile.RNASeQC, '"'), '\n',
paste0('rRNAfile="', rRNAfile, '"'),
'\n',
'\n',
paste0('samplesuffix="',bam.suffix,'"'),
'\n',
'\n',
paste0('genome_fa="',genome.fa,'"'), '\n'
)
for (i in 1:length(samples)){
if (i ==1){
tempShellScriptVector <- c(
tempShellScriptVector,
paste0('samples="', samples[i])
)
} else {
tempShellScriptVector <- c(
tempShellScriptVector,
'\n', samples[i]
)
}
}
tempShellScriptVector <- c(
tempShellScriptVector,
'"', '\n',
'\n',
'\n',
'#################################################################################', '\n',
'##FUNCTIONS######################################################################', '\n',
'#################################################################################', '\n',
'\n',
'wait_on_lsf() { ## wait on jobs{', '\n',
'sleep 300', '\n',
'n=`squeue --name=$project | wc -l`', '\n',
#'n=n-1',
'\n',
'while [ $n -ne 1 ]', '\n',
'do', '\n',
'n=`squeue --name=$project | wc -l`', '\n',
#'n=n-1',
'\n',
'((z=$n-1))', '\n',
'#number of running', '\n',
'echo "$project jobs ($projectID) running: $z"',
'\n',
'#number of pending', '\n',
'sleep 300', '\n',
'done', '\n',
'}', '\n',
'\n',
'\n',
'\n',
'## End of function ##', '\n',
'#################################################################################', '\n',
'\n', '\n',
'#################################################################################', '\n',
'# Prere bam files for RNASeQC #', '\n',
'#################################################################################', '\n',
'#################################################################################', '\n',
'# AddOrReplaceReadGroups #', '\n',
'#################################################################################', '\n','\n',
'projectID="AddOrReplaceReadGroups"', '\n',
'module load R/3.3.1-foss-2016b-bioc-3.3-libX11-1.6.3', '\n','\n',
'echo "module load R/3.3.1-foss-2016b-bioc-3.3-libX11-1.6.3" >> commands.txt', '\n',
'module load picard/2.1.1-Java-1.8.0_112', '\n','\n',
'echo "module load picard/2.1.1-Java-1.8.0_112" >> commands.txt', '\n',
'echo "redo headers on Tophat bam output"', '\n',
'echo "#Submitted jobs" >> commands.txt ', '\n',
'#projectID="headers"', '\n',
'for sample in $samples', '\n',
' do ', '\n',
' echo "java -jar ${EBROOTPICARD}/picard.jar AddOrReplaceReadGroups \\', '\n',
' I=${alignDir}/${sample}.${samplesuffix} \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' RGID=$sample \\', '\n',
' RGCN=CCCB \\', '\n',
' RGLB=lib1 \\', '\n',
' RGPL=ILLUMINA \\', '\n',
' RGPU=NA \\', '\n',
' RGSM=accepted_hits.bam" >> commands.txt', '\n',
' sbatch --time=12:00:00 --wrap "java -jar ${EBROOTPICARD}/picard.jar AddOrReplaceReadGroups \\', '\n',
' I=${alignDir}/${sample}.${samplesuffix} \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' RGID=$sample \\', '\n',
' RGCN=TheFrancisCrickInstitute \\', '\n',
' RGLB=lib1 \\', '\n',
' RGPL=ILLUMINA \\', '\n',
' RGPU=NA \\', '\n',
' RGSM=accepted_hits.bam" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.addorreplacereadgroups.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n', '\n',
'#################################################################################', '\n',
'# SortSam #', '\n',
'#################################################################################', '\n', '\n',
'projectID="SortSam"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'echo "SortSam co-ordinate sort" >> commands.txt', '\n',
'echo "SortSam output"', '\n',
'#projectID="coordinate_sort"', '\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar SortSam \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
' SO=coordinate \\', '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt ', '\n',
' sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar SortSam \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.bam \\', '\n',
' O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
' SO=coordinate \\', '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.sortsam.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'for sample in $samples', '\n',
'do', '\n',
'echo "rm ${alignDir}/${sample}.accepted_hits.readgroups.bam" >> commands.txt ', '\n',
'rm ${alignDir}/${sample}.accepted_hits.readgroups.bam ', '\n',
'done', '\n',
'\n',
'#################################################################################', '\n',
'# Reorder SAM #', '\n',
'#################################################################################', '\n', '\n',
'projectID="ReorderSam"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'echo "reorder reads to match contigs in the reference" >> commands.txt', '\n',
'echo "Reorder reads to match contigs in the reference"', '\n',
'for sample in $samples', '\n',
'do ', '\n',
'echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar ReorderSam \\', '\n',
'I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
'O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
'REFERENCE=${genome_fa} \\', '\n',
'TMP_DIR=\'pwd\'/tmp" >> commands.txt ', '\n',
'\n',
'sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar ReorderSam \\', '\n',
'I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam \\', '\n',
'O=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
'REFERENCE=${genome_fa} \\', '\n',
'TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.reordersam.slurm >> commands.txt ', '\n',
'\n',
'done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam" >> commands.txt', '\n',
' rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.bam', '\n',
' done', '\n',
'\n',
'#################################################################################', '\n',
'# MarkDuplicates #', '\n',
'#################################################################################', '\n', '\n',
'projectID="MarkDuplicates"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="markdups"', '\n',
'echo "mark duplicates" >> commands.txt', '\n',
'echo "mark duplicates"', '\n',
' for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar MarkDuplicates \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
' O=${alignDir}/${sample}.d.bam \\', '\n',
' METRICS_FILE=${alignDir}/${sample}/dup_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt', '\n',
'\n',
' sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar MarkDuplicates \\', '\n',
' I=${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam \\', '\n',
' O=${alignDir}/${sample}.d.bam \\', '\n',
' METRICS_FILE=${alignDir}/${sample}/dup_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 2 --mem-per-cpu=7000 -o logs/$sample.markduplicates.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'##############################', '\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam" >> commands.txt', '\n',
' rm ${alignDir}/${sample}.accepted_hits.readgroups.sorted.reordered.bam', '\n',
' done', '\n',
'#################################################################################', '\n',
'# Samtool indexing #', '\n',
'#################################################################################', '\n','\n',
'projectID="Samtool indexing"', '\n',
'module load SAMtools/1.3.1-foss-2016b', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'echo "module load SAMtools/1.3.1-foss-2016b" >> commands.txt', '\n',
'echo "Samtool Indexing "', '\n',
'for sample in $samples', '\n',
' do', '\n',
' echo "samtools index ${alignDir}/${sample}.d.bam" >> commands.txt', '\n',
' sbatch --time=12:00:00 --wrap "samtools index ${alignDir}/${sample}.d.bam" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.samtools.index.slurm >> commands.txt ', '\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n'
)
###########################################################################
## Add rnaseq metrics ##
if (refFlatFile != "" & ribosomalIntervalList != ""){
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run CollectRnaSeqMetrics #', '\n',
'#################################################################################', '\n', '\n',
'projectID="CollectRnaSeqMetrics"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="CollectRnaSeqMetrics"', '\n',
'echo "CollectRnaSeqMetrics" >> commands.txt', '\n',
'echo "CollectRnaSeqMetrics"', '\n',
' for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar CollectRnaSeqMetrics \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.output.RNA_Metrics \\', '\n',
paste0(' REF_FLAT=',refFlatFile,' \\'), '\n',
paste0(' STRAND=',strandedness,' \\'), '\n',
paste0(' RIBOSOMAL_INTERVALS=',ribosomalIntervalList,' \\'), '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt', '\n',
'\n',
'\n',
'sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar CollectRnaSeqMetrics \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.output.RNA_Metrics \\', '\n',
paste0(' REF_FLAT=',refFlatFile,' \\'), '\n',
paste0(' STRAND=',strandedness,' \\'), '\n',
paste0(' RIBOSOMAL_INTERVALS=',ribosomalIntervalList,' \\'), '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 2 --mem-per-cpu=7000 -o logs/$sample.rnaseqmetrics.slurm >> commands.txt ', '\n',
'\n',
'\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'\n'
)
}
## End rnaseq metrics ##
###########################################################################
###########################################################################
## Estimate library complexity ##
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run Estimate Library Complexity #', '\n',
'#################################################################################', '\n', '\n',
'projectID="EstimateLibraryComplexity"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="EstimateLibraryComplexity"', '\n',
'echo "EstimateLibraryComplexity" >> commands.txt', '\n',
'echo "EstimateLibraryComplexity"', '\n',
' for sample in $samples', '\n',
' do', '\n',
' echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar EstimateLibraryComplexity \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.est_lib_complex_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" >> commands.txt', '\n',
'\n',
'\n',
'sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTPICARD}/picard.jar EstimateLibraryComplexity \\', '\n',
' I=${alignDir}/${sample}.d.bam \\', '\n',
' O=${alignDir}/${sample}.est_lib_complex_metrics.txt \\', '\n',
' TMP_DIR=\'pwd\'/tmp" --job-name=$project -c 2 --mem-per-cpu=7000 -o logs/$sample.estimatelibrarycomplexity.slurm >> commands.txt ', '\n',
'\n',
'\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'\n'
)
## Done estimating library complexity ##
###########################################################################
###########################################################################
## Add infer experiment ##
if (bedFile != ""){
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run RNASeQC Infer_experiment #', '\n',
'#################################################################################', '\n', '\n',
'projectID="Infer_experiment"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'projectID="Infer_experiment"', '\n',
'echo "Infer_experiment" >> commands.txt', '\n',
'echo "Infer_experiment"', '\n',
'module purge;','\n',
'module load RSeQC/2.6.4-foss-2016b-Python-2.7.12-R-3.3.1;', '\n',
' for sample in $samples', '\n',
' do', '\n',
paste0(
'echo "infer_experiment.py -r ',
bedFile,
' -i ${alignDir}/${sample}.d.bam > ${alignDir}/${sample}.infer_experiment.txt" >> commands.txt \\'
),
'\n',
'\n',
paste0('sbatch --time=12:00:00 --wrap "infer_experiment.py -r ',
bedFile,
' -i ${alignDir}/${sample}.d.bam > ${alignDir}/${sample}.infer_experiment.txt" --job-name=$project -c 1 --mem-per-cpu=7000 -o logs/$sample.infer_experiment.slurm >> commands.txt '
),
'\n',
' done', '\n',
'##wait on jobs', '\n',
'wait_on_lsf', '\n',
'\n',
'\n'
)
}
## End rnaseq metrics ##
###########################################################################
tempShellScriptVector <- c(
tempShellScriptVector,
'#################################################################################', '\n',
'# Run RNASeqC #', '\n',
'#################################################################################', '\n','\n',
'projectID="RNAseQC"', '\n',
'echo "###########################################################################################" >> commands.txt', '\n',
'module load RNA-SeQC/1.1.8-Java-1.7.0_80', '\n', '\n',
'echo "module load RNA-SeQC/1.1.8-Java-1.7.0_80" >> commands.txt', '\n',
'echo "make RNASeqC sample list"', '\n',
'cd ${alignDir}', '\n',
'\n',
'if [ ! -d ${projectID} ]; then', '\n',
' mkdir ${projectID}', '\n',
'fi', '\n',
'\n',
'\n',
'echo "sample list" > ${alignDir}/${projectID}/sample.list', '\n',
'for sample in $samples', '\n',
'do', '\n',
'echo -e "$sample\t${alignDir}/${sample}.d.bam\tNA" >>${alignDir}/${projectID}/sample.list', '\n',
'done', '\n',
'#wait', '\n',
'\n',
'echo "run RNA-seqQC"', '\n',
'\n',
'#RNASeqQC requires Java version 1.7 and does not run on the most recent version. ', '\n',
'\n',
'echo "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTRNAMINSEQC}/RNA-SeQC_v1.1.8.jar \\', '\n'
)
if (!paired.end){
tempShellScriptVector <- c(
tempShellScriptVector,
'-singleEnd \\', '\n'
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
'-o ${alignDir}/${projectID}/ \\', '\n',
'-r ${genome_fa} \\', '\n',
'-s ${alignDir}/${projectID}/sample.list \\', '\n',
'-t $GTFfile \\', '\n',
'-gatkFlags \'-S SILENT -U ALLOW_SEQ_DICT_INCOMPATIBILITY\' \\', '\n',
'-rRNA $rRNAfile " >> commands.txt', '\n',
'\n',
'sbatch --time=12:00:00 --wrap "java -Xmx10g -Djava.io.tmpdir=\'pwd\'/tmp -jar ${EBROOTRNAMINSEQC}/RNA-SeQC_v1.1.8.jar \\', '\n'
)
if (!paired.end){
tempShellScriptVector <- c(
tempShellScriptVector,
'-singleEnd \\', '\n'
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
'-o ${alignDir}/${projectID}/ \\', '\n',
'-r ${genome_fa} \\', '\n',
'-s ${alignDir}/${projectID}/sample.list \\', '\n',
'-t $GTFfile \\', '\n',
'-gatkFlags \'-S SILENT -U ALLOW_SEQ_DICT_INCOMPATIBILITY\' \\', '\n',
'-rRNA $rRNAfile " --job-name=$project -c 1 --mem-per-cpu=7000 -o ${alignDir}/rnaseqc.slurm >> ${alignDir}/commands.txt ', '\n',
'\n',
'\n',
'wait_on_lsf',
'\n',
'module purge; module use /camp/stp/babs/working/software/modules/all; module load multiqc/1.3-2016b-Python-2.7.12',
'\n',
paste0("multiqc ", workdir),
'\n',
'#end of file', '\n'
)
sink(paste0(project.code, '.rnaseqc.script.sh'))
for (i in 1:length(tempShellScriptVector)){
cat(tempShellScriptVector[i])
}
sink()
return(tempShellScriptVector)
}
###############################################################################
#End Create RNASeqQC script #
###############################################################################
###############################################################################
## (7b) createbulkRNASeqAnalysisBashScripts ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createbulkRNASeqAnalysisBashScripts <- function(
obj = "biologic object",
scriptVecSlot = "scriptVec"
){
###############################################################################
## Create shell script to rename files ##
tempShellScriptVector <- as.vector(NULL, mode = "character")
tempShellScriptVector <- c(
tempShellScriptVector,
"###############################################################################",
"\n",
"## Creating softlinks for fastq files in ASF seq storage ##",
"\n"
)
for (i in 1:nrow(obj@dfDesign)){
string <- paste0(
"ln -s ",
obj@dfDesign$original.NGS[i],
" ",
obj@dfDesign$NGS[i]
)
tempShellScriptVector <- c(
tempShellScriptVector,
string,
"\n"
)
}
tempShellScriptVector <- c(
tempShellScriptVector,
"## Done creating softlinks to ASF seq storage ##",
"\n",
"###############################################################################",
"\n",
"\n",
"\n",
"\n"
)
## Write to shell script ##
# setwd(obj@parameterList$localWorkDir)
# sink("create.fastq.softlinks.sh")
#for (i in 1:length(tempShellScriptVector)){
# cat(tempShellScriptVector[i])
#}
#sink()
## Add to overall shell script documentation
## Remove end of line characters ##
#print("Remove end of line characters: tr -d '\r' <create.fastq.softlinks.sh> conv.create.fastq.softlinks.sh")
## Run ##
#print("sh conv.create.fastq.softlinks.sh")
# awk '{ sub("\r$", ""); print }' windows.txt > unix.txt
#awk 'sub("$", "\r")' unixfile.txt > winfile.txt
###############################################################################
## Create strings for documentation ##
## fastq folder ##
## Create documentation module here ##
value <- c(
paste0(
"FASTQ file location:",
unlist(obj@parameterList$pathToSeqStorageFolder)
),
"FASTQ file names:",
obj@dfDesign$original.NGS,
"",
"Project FASTQ file names:",
obj@dfDesign$NGS,
""
)
obj <- add2vec(
obj = obj,
slot_name = "documentationVector",
value = value
)
if (obj@parameterList$paired.end){
value <- c(
value,
"Alignment mode: paired-end"
)
} else {
value <- c(
value,
"Alignment mode: single-end"
)
}
if (obj@parameterList$stranded){
value <- c(
value,
"This is a stranded dataset"
)
} else {
value <- c(
value,
"This is not a stranded dataset"
)
}
value <- c(
value,
paste0("Reference genome/transcriptome: ",
obj@parameterList$genome,
"-",
obj@parameterList$release
)
)
obj <- add2vec(
obj,
slot_name = "documentationVector",
value = value
)
###########################################################################
## Temporary adding tempshell script vec ##
obj <- add2vec(
obj = obj,
slot_name = scriptVecSlot,
value = tempShellScriptVector
)
tempShellScriptVector <- ""
## Add here: automated creation of powerpoint slide.
## Powerpoint slides to be generated:
## Documentation slide
## MA plot for each comparison slide
## PCA plot slide
## Heatmap slide
## Sample names/specifications slide
## Cluster dendrogram slide
## RNASeqQC slide
## End create strings for documentation ##
###############################################################################
###############################################################################
# Create trim galore shell script #
###############################################################################
obj <- createTrimGaloreShellScript(
obj = obj
)
# If this shell script is created on a windows machine, don't forget to remove the end of line '\r' characters
#print(
# paste0(
# "tr -d '\r' <",
# project.code,
# ".trimgalore.script.sh> conv.",
# project.code,
# ".trimgalore.script.sh"
# )
#)
# Write dfDesign to file so it can be re-read once the alignment is done
# setwd(localWorkDir)
# write.table(dfDesign, design.file, row.names= FALSE, sep="\t")
###############################################################################
# Align #
###############################################################################
obj <- createAlignmentShellScript(
obj = obj
)
# If this shell script is created on a windows machine, don't forget to remove the end of line '\r' characters
# print(
# paste0(
###############################################################################
# Prepare RNAseQC script #
###############################################################################
obj <- createRNAseqQCscript(
obj = obj,
bamSuffix = "STAR.genome.bam",
scriptVecSlot = "scriptVec"
)
# If this shell script is created on a windows machine, don't forget to remove the end of line '\r' characters
# print(
# paste0(
# "tr -d '\r' <",
# project.code,
# ".rnaseqc.script.sh> conv.",
# project.code,
# ".rnaseqc.script.sh"
# )
# )
## Add to shell script documentation vector ##
###############################################################################
## Produce shell script ##
fn <- paste0(
obj@parameterList$localWorkDir,
obj@parameterList$project_id,
".documentationShell.script.sh"
)
sink(fn)
scriptVec <- slot(obj, "scriptVec")
for (i in 1:length(scriptVec)){
cat(scriptVec[i])
}
sink()
## use http://hilite.me/ for code conversion and display in script.php
## Done producing documentatin shell script ##
###############################################################################
print("If the script was created on a Windows machine remove end of line characters by running:")
print(
paste0(
"tr -d '\r' <",obj@parameterList$project_id,".documentationShell.script.sh> conv.",obj@parameterList$project_id,".documentationShell.script.sh"
)
)
#returnList <- list(
# "dfDesign" = dfDesign,
# "shellScriptVector" = shellScriptVector,
# "documentationVector" = documentationVector
#)
return(obj)
}
## End (7b) create createbulkRNASeqAnalysisBashScripts ##
###############################################################################
###############################################################################
# (7) Create tpm and fpkm value tables #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
create.tpm.and.fpkm.tables <- function(
workdir,
samples,
files){
for (i in 1:length(files)) {
df.temp = read.delim(
files[i],
header=TRUE,
sep="\t",
stringsAsFactors = FALSE
)
df.temp.tpm = df.temp[,c("gene_id", "TPM")]
df.temp.fpkm = df.temp[,c("gene_id", "FPKM")]
names(df.temp.tpm)[2] = paste(
samples[i],
names(df.temp.tpm)[2],
sep="_"
)
names(df.temp.fpkm)[2] = paste(
samples[i],
names(df.temp.fpkm)[2],
sep="_"
)
if (i > 1){
df.tpm = merge(
df.tpm,
df.temp.tpm,
by.x = "gene_id",
by.y = "gene_id",
all=TRUE
)
df.fpkm = merge(
df.fpkm,
df.temp.fpkm,
by.x = "gene_id",
by.y = "gene_id",
all=TRUE
)
} else {
df.tpm = df.temp.tpm
df.fpkm = df.temp.fpkm
}
}
## Make numeric ##
df.tpm[,grep("TPM", names(df.tpm))] <- apply(df.tpm[,grep("TPM", names(df.tpm))],2,as.numeric)
df.fpkm[,grep("FPKM", names(df.fpkm))] <- apply(df.fpkm[,grep("FPKM", names(df.fpkm))],2,as.numeric)
list.tpm.fpkm = list(df.tpm = df.tpm, df.fpkm=df.fpkm)
return(list.tpm.fpkm)
}
## End of function ##
###############################################################################
###############################################################################
# (8a) readAndPrepareCountMatrix #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
readAndPrepareCountMatrix <- function(
count.data.fn = 'paste0(raw.count.dir, "/",count.data.file)',
string.to.be.deleted.in.raw.counts.columns = 'paste0("X", gsub("/", ".",paste0(workdir, "RSEM/Ensembl/")))',
df.design = "dfDesign"
){
## Read raw counts file ##
raw.counts <- read.delim(
file=count.data.fn,
header=TRUE,
stringsAsFactors = FALSE
)
rownames(raw.counts) <- raw.counts[,1]
raw.counts <- raw.counts[,2:ncol(raw.counts)]
raw.counts <- round(raw.counts)
colnames(raw.counts) <- gsub(
".genes.results",
"",
colnames(raw.counts)
)
colnames(raw.counts) <- gsub(
string.to.be.deleted.in.raw.counts.columns,
"",
colnames(raw.counts)
)
## Reorder raw counts according to df.design ##
raw.counts <- raw.counts[,unique(df.design$sample.id)]
#reduce raw.counts table
#Filtering step was taken out, as DESeq2 does not require it
raw.counts.filt = data.matrix(raw.counts)
## Transform raw.counts filt to integer ##
rn <- row.names(raw.counts.filt)
raw.counts.filt <- apply(
raw.counts.filt,
2,
as.integer
)
row.names(raw.counts.filt) <- rn
return(raw.counts.filt)
}
## ##
###############################################################################
###############################################################################
## (46) selectHeatmapGenes() ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
selectHeatmapGenes <- function(
dfData = "df.summary",
selCol = "logFC",
cutOff = 0.75,
pSelCol = "padj",
pCutOff = 1,
zeroOneCol = "logFC_cut_off",
geneID = "hgnc_symbol"
){
dfData[[zeroOneCol]] <- 0
cols <- names(dfData)[grep(selCol, names(dfData))]
pCols <- names(dfData)[grep(pSelCol, names(dfData))]
pos <- grep(zeroOneCol, cols)
if (length(pos) > 0) {
cols <- cols[-pos]
}
for (i in 1:length(cols)){
if (pCutOff ==1){
dfData[, zeroOneCol] <- ifelse(
((dfData[,cols[i]] > cutOff) | (dfData[,cols[i]] < -1*cutOff)),
dfData[, zeroOneCol]+ 1,
dfData[, zeroOneCol]+ 0
)
} else {
dfData[, zeroOneCol] <- ifelse(
((((dfData[,cols[i]] > cutOff) | (dfData[,cols[i]] < -1*cutOff))) &
(dfData[,pCols[i]] < pCutOff)),
dfData[, zeroOneCol]+ 1,
dfData[, zeroOneCol]+ 0
)
}
}
dfData[dfData[,zeroOneCol] > 1,zeroOneCol] <- 1
count <- nrow(
unique(dfData[dfData[, zeroOneCol] > 0, c(zeroOneCol, geneID)])
)
print(paste0(count, " genes selected."))
return(dfData)
}
## End of function (46) ##
###############################################################################
###############################################################################
## Make heatmap ##
###############################################################################
# Make heatmap function #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
make.hm <- function(
m.df1,
filename = "heatmap",
k.number = 10,
n.colors = 1000,
hclust.method = "complete",
dist.method = "euclidean",
main = "",
Colv = TRUE,
showRowNames = FALSE,
showColNames = TRUE,
plotSeparationLines = FALSE
) {
library(RColorBrewer)
library(gplots)
hclustfunc <- function(x) hclust(x, method = hclust.method)
distfunc <- function(x) dist(x, method = dist.method)
d <- distfunc(m.df1)
fit <- hclustfunc(d)
clusters <- cutree(fit, k = k.number)
nofclust.height <- length(unique(as.vector(clusters)))
a = max(abs(m.df1))
breaks = seq((-1 * a), a, length.out = n.colors)
## Retired 20180228 ##
#hmcols <- colorRampPalette(c("blue", "white", "red"))(n = (length(breaks) - 1))
## Start New:
# blueWhiteRedVec <- rev(
# colorRampPalette(brewer.pal(9, "RdBu"))(3)
# )
blueWhiteRedVec <- c("#3060cf", "#fffbbc","#c4463a")
hmcols <- colorRampPalette(
blueWhiteRedVec
)(n = (length(breaks) - 1))
selcol <- colorRampPalette(brewer.pal(12, "Set3"))
selcol2 <- colorRampPalette(brewer.pal(9, "Set1"))
clustcol.height = selcol2(nofclust.height)
if (filename != ""){
pdf(
paste(
filename,
"pdf",
sep = "."
)
)
}
if (showRowNames){
labRowVec = row.names(m.df1)
} else {
labRowVec = rep("", nrow(m.df1))
}
if (showColNames){
labColVec = colnames(m.df1)
} else {
labColVec = rep("", length(colnames(m.df1)))
}
if (plotSeparationLines) {
colsep = c(0: ncol(m.df1))
rowsep = c(0: nrow(m.df1))
} else {
colsep = c(0, ncol(m.df1))
rowsep = c(0, nrow(m.df1))
}
hm = heatmap.2(
m.df1,
trace = "none",
dendrogram = "both",
density.info = "none",
keysize = 1,
key = TRUE,
Colv = Colv,
hclust = hclustfunc, distfun = distfunc, col = hmcols,
symbreak = T,
labRow = labRowVec,
labCol = labColVec,
RowSideColors = clustcol.height[clusters],
margins = c(10, 10),
cexCol = 1,
cexRow = 0.5,
srtCol = 45,
srtRow = 0,
main = main,
breaks = breaks,
sepcolor = "black",
sepwidth = c(5e-04, 5e-05),
colsep = colsep,
rowsep = rowsep
)
if (filename != ""){
dev.off()
}
sorted = m.df1[
match(
rev(
labels(hm$rowDendrogram)),
rownames(m.df1)
),
]
sorted = sorted[, hm$colInd]
if (filename != ""){
pdf(paste(filename, "colorkey.pdf", sep = "."))
}
plot.new()
par(lend = 1)
legend("topleft", legend = 1:nofclust.height, col = clustcol.height,
lty = 1, lwd = 10)
if (filename != ""){
dev.off()
}
df.res = list(sorted = sorted, clusters = clusters)
return(df.res)
#return(clusters)
}
## End make heatmap ##
###############################################################################
###############################################################################
## (1) Datatable.to.website.ptm ##
###############################################################################
# df data input
# Requires a logFC mention in the contrast_X_
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
#'
datatable.to.website.ptm <- function (
df.data,
gene.id.column = "ENSMUSG",
heatmap.genes = "", #Relevant genes has to be the same id class as in gene.id.column
n.cluster.genes = 6000,
count.data = FALSE,
logFC.cut.off = 0, # Either 0 or 1. If 1, then df.data needs to contain
# a logFC_cut_off column that is either 0 (exclude row in heatmap)
# or 1 (include row in heatmap)
selector4heatmap.cols = "logFC",
heatmap.preprocessing = "lg2.row.avg", # possible: "lg2", "lg2.row.avg", "none"
hm.cut.off = 4,
n.hm.cluster = 10,
count.cut.off.filter = 1
) {
###########################################################################
## Prepare data table ##
# Remove all rows not featuring as an entry in the primary.gene.id.column
df.data <- df.data[!is.na(df.data[, gene.id.column]), ]
df.data <- unique(df.data)
df.data[is.na(df.data)] <- ""
# Enable filtering of low count rows
if (length(grep("^count_cut_off$", names(df.data))) == 0 ){
if (count.data){
df.data[["count_cut_off"]] <- 0
df.data[,"count_cut_off"] <- rowSums(df.data[,grep("norm_counts", names(df.data))])
df.data[,"count_cut_off"] <- df.data[,"count_cut_off"]/length(grep("norm_counts", names(df.data)))
} else {
df.data[["count_cut_off"]] <- 5
}
}
df.data <- df.data[df.data$count_cut_off > count.cut.off.filter,]
df.data[["row_id"]] <- paste(
rep("R", nrow(df.data)),
1:nrow(df.data),
sep = ""
)
## Calculate coeficient of variation based on norm_counts column for each row
df.data["CoVar"] <- 0
## Ignore low-intesity rows ##
df.data[df.data$count_cut_off > 1,"CoVar"] <- apply(
df.data[df.data$count_cut_off > 1, grep("^norm_counts_", names(df.data))],
1,
function(x) sd(x)/mean(x)
)
df.data[is.na(df.data)] <- 0
df.data[df.data$CoVar == Inf, "CoVar"] <- max(df.data[df.data$CoVar < Inf ,"CoVar"])
# Order from highest to lowest CoVar
df.data <- df.data[order(df.data$CoVar, decreasing = TRUE),]
df.data[["CoVarOrder"]] <- 1:nrow(df.data)
# Select columns for heatmaps and plot display
df.lg2.row.avg.table <- df.data[, grep(selector4heatmap.cols, names(df.data))]
row.names(df.lg2.row.avg.table) <- df.data[, "row_id"]
## Remove column handle from heatmap column ##
if (length(grep("contrast_", names(df.lg2.row.avg.table))) > 0){
names(df.lg2.row.avg.table) <- gsub("contrast_", "", names(df.lg2.row.avg.table))
## Remove contrast number ##
names(df.lg2.row.avg.table) <- substr(
names(df.lg2.row.avg.table),
2,
100
)
} else if (length(grep("norm_counts_", names(df.lg2.row.avg.table))) > 0){
names(df.lg2.row.avg.table) <- gsub(
"norm_counts_",
"",
names(df.lg2.row.avg.table)
)
}
## Take care of double digit contrast numbers ##
names(df.lg2.row.avg.table) <- gsub(
"^_",
"",
names(df.lg2.row.avg.table)
)
names(df.lg2.row.avg.table) <- paste(
"lg2_avg_",
names(df.lg2.row.avg.table),
sep = ""
)
# Ensure numericness
df.lg2.row.avg.table[, grep("lg2_avg", names(df.lg2.row.avg.table))] <- apply(
df.lg2.row.avg.table[, grep("lg2_avg", names(df.lg2.row.avg.table))],
2,
as.numeric
)
## End df.lg2.row.avg.table creation for all rows ##
###########################################################################
## Create heatmap parameters and default selections ##
# Data preprocessing accoring to selection #
if (heatmap.preprocessing == "lg2"){
for (i in 1:nrow(df.lg2.row.avg.table)) {
df.lg2.row.avg.table[i, ] <- log2(df.lg2.row.avg.table[i,])
}
} else if (heatmap.preprocessing == "lg2.row.avg"){
# Calculate row means
row_means <- rep(0, nrow(df.lg2.row.avg.table))
for (i in 1:nrow(df.lg2.row.avg.table)){
temp.row <- df.lg2.row.avg.table[i, grep("lg2_avg", names(df.lg2.row.avg.table))]
temp.row <- temp.row[temp.row != 0]
if (length(temp.row) > 0){
row_means[i] <- mean(temp.row)
}
}
## Retired 20160621 ## Start ##
#row_means <- apply(
# df.lg2.row.avg.table[, grep("lg2_avg", names(df.lg2.row.avg.table))], 1, mean
#)
## Retired 20160621 ## End ##
# Avoid devison by 0
row_means[row_means == 0] <- 0.001
for (i in 1:nrow(df.lg2.row.avg.table)) {
df.lg2.row.avg.table[i, ] <- log2(df.lg2.row.avg.table[i,]/row_means[i])
}
}
# If 'none' or anything else is selected for heatmap processing, The values will be used as 'is' for
# the heatmap display
## Set all Infs to 0 ##
df.lg2.row.avg.table[df.lg2.row.avg.table == Inf ] <- 0
df.lg2.row.avg.table[df.lg2.row.avg.table == -Inf ] <- 0
# Limit top/bottom values of heatmap display
df.lg2.row.avg.table[df.lg2.row.avg.table > hm.cut.off] <- hm.cut.off
df.lg2.row.avg.table[df.lg2.row.avg.table < (-1) * hm.cut.off] <- (-1) * hm.cut.off
df.lg2.row.avg.table[, "row_id"] <- row.names(df.lg2.row.avg.table)
df.data = merge(df.data, df.lg2.row.avg.table, by.x <- "row_id", by.y = "row_id")
df.data = na.omit(df.data)
row.names(df.data) = make.names(df.data[, "row_id"])
## Make gene selection for heatmap ##
# If the selection is provided in the heatmap.genes vector
# these genes are used
# Create logFC_cut_off column if not present in dataset
if (length(grep("logFC_cut_off", names(df.data))) == 0){
df.data[["logFC_cut_off"]] <- 0
}
if (sum(df.data$logFC_cut_off) > 0){
df.hm.sel <- df.data[df.data$logFC_cut_off == 1,]
} else {
df.hm.sel <- df.data
}
if (heatmap.genes[1] == "" | is.na(heatmap.genes[1])){
## Select gene subset for heatmap based on coefficient of variation
heatmap.genes <- as.vector(
unique(
df.hm.sel[,gene.id.column]
)
)
} else {
# Make sure all listed gene IDs are present in the dataset
heatmap.genes <- heatmap.genes[heatmap.genes %in% df.hm.sel[,gene.id.column]]
}
# Done selecting heatmap genes #
# Limiting number of genes for heatmap display accoring to specifications #
## Query logFC limitation ##
# Limit based on Coeficient of variation #
if (length(heatmap.genes) > n.cluster.genes){
dfSel <- df.hm.sel
#row.names(dfSel) <- NULL
dfSel <- unique(dfSel[,c("CoVar", "CoVarOrder", gene.id.column)])
dfSel <- dfSel[order(dfSel$CoVarOrder),]
heatmap.genes <- as.vector(dfSel[,gene.id.column])[1:n.cluster.genes]
}
# Create df.cluster #
if (sum(df.data$logFC_cut_off) > 0){
df.cluster <- df.data[
df.data[,gene.id.column] %in% heatmap.genes &
df.data$logFC_cut_off == 1,
grep("lg2_avg", names(df.data))
]
} else {
df.cluster <- df.data[
df.data[,gene.id.column] %in% heatmap.genes,
grep("lg2_avg", names(df.data))
]
}
## Done selecting genes for heatmap ##
###############################################################################
###############################################################################
# Make heatmap function #
###############################################################################
## Function definition moved to package ##
## Flatening ##
df.cluster[df.cluster > hm.cut.off] = hm.cut.off
df.cluster[df.cluster < (-1) * hm.cut.off] = (-1) * hm.cut.off
m.cluster = data.matrix(df.cluster)
m.cluster[is.na(m.cluster)] = 0
colnames(m.cluster) <- gsub("lg2_avg_", "", colnames(m.cluster))
#m.cluster[m.cluster == 0] = 0.01
## Make col.sorted heatmap ##
hm.res = make.hm(
m.cluster,
filename = "heatmap.col.sorted",
k.number = n.hm.cluster,
n.colors = 20,
hclust.method = "ward.D2",
dist.method = "euclidean",
main = "",
Colv = FALSE
)
## Make col-clustered heatmap ##
hm.res = make.hm(
m.cluster,
filename = "heatmap.col.clustered",
k.number = n.hm.cluster,
n.colors = 20,
hclust.method = "ward.D2",
dist.method = "euclidean",
main = "",
Colv = TRUE
)
## Extract cluster order ##
df.clust.order <- data.frame(hm.res$sorted)
cluster.ordered.sample.vector <- names(df.clust.order)
df.clust.order[["cluster_order"]] <- 1:nrow(df.clust.order)
df.clust.order[, "row_id"] <- row.names(df.clust.order)
df.clust.order <- df.clust.order[, c("row_id", "cluster_order")]
## Extract sample order ##
df.sample.order <- hm.res$sorted
sampleColClustOrder <- names(data.frame(df.sample.order))
## Re-order lg2_avg and norm_counts accordingly ##
renameVec <- names(df.data)
# Remove lg2_avg_entries #
oldLog2AvgEntries <- names(df.data)[grep("lg2_avg", names(df.data))]
newLog2AvgEntries <- paste0("lg2_avg_", sampleColClustOrder)
if (sum(!(oldLog2AvgEntries %in% newLog2AvgEntries)) == 0){
renameVec <- renameVec[!(renameVec %in% newLog2AvgEntries)]
renameVec <- c(
renameVec,
newLog2AvgEntries
)
}
# Remove norm_counts #
oldNormCountsEntries <- names(df.data)[grep("norm_counts_", names(df.data))]
newNormCountsEntries <- paste0("norm_counts_", sampleColClustOrder)
if (sum(!(oldNormCountsEntries %in% newNormCountsEntries)) == 0){
renameVec <- renameVec[!(renameVec %in% newNormCountsEntries)]
renameVec <- c(
renameVec,
newNormCountsEntries
)
}
## Reorder columns in df.data ##
if (sum(!(names(df.data) %in% renameVec)) == 0){
df.data <- df.data[,renameVec]
}
## Add to main data table ##
remove <- as.vector(
na.omit(
match(
df.clust.order[, "row_id"],
df.data[, "row_id"]
)
)
)
id.vector = as.vector(df.data[-remove, "row_id"])
df.rest = data.frame(id.vector, rep(0, length(id.vector)))
names(df.rest) = names(df.clust.order)
df.clust.order = rbind(df.clust.order, df.rest)
df.data = merge(df.data, df.clust.order, by.x = "row_id",
by.y = "row_id")
df.data = df.data[!is.na(df.data[, gene.id.column]), ]
df.data = unique(df.data)
# Add cluster id
df.cluster.id <- data.frame(na.omit(hm.res$clusters))
df.cluster.id[["row_id"]] <- row.names(df.cluster.id)
names(df.cluster.id)<- c("cluster_id", "row_id")
df.cluster.id <- df.cluster.id[grep("R", df.cluster.id$row_id),]
# Adding all other ids
row_id <- df.data[!(df.data$row_id %in% df.cluster.id$row_id), "row_id"]
cluster_id <- rep(0, length(row_id))
df.add <- rbind(df.cluster.id, data.frame(cluster_id, row_id))
df.data <- merge(df.data, df.add, by.x = "row_id", by.y="row_id", all=TRUE)
df.data[is.na(df.data)] = ""
# # Add gene descripton
# if (!exists("df.anno")){
# df.anno <- read.delim(
# gene.id.table,
# header = TRUE,
# sep = "\t",
# stringsAsFactors = FALSE
# )
# }
#
# # Remove all entries from df.anno that are not present in df.data
# df.anno <- df.anno[df.anno[,gene.id.column] %in% df.data[,gene.id.column],]
# if (!add.uniprot.column){
# df.anno$uniprot = NULL
# df.anno <- unique(df.anno)
# }
#
# df.anno <- unique(df.anno)
#
# df.data <- merge(
# df.data,
# df.anno,
# by.x = gene.id.column,
# by.y = gene.id.column,
# all=TRUE
# )
#
# df.data[is.na(df.data)] = ""
# df.data = unique(df.data)
#
# if (gene.id.column == "mgi_symbol" | gene.id.column == "ENSMUSG") {
# ENSG <- "ENSMUSG"
# } else if (gene.id.column == "hgnc_symbol" | gene.id.column == "ENSG") {
# ENSG <- "ENSG"
# }
#
# if (gene.id.column != "mgi_symbol" | gene.id.column != "hgnc_symbol"){
# df.data$gene_description <- paste0(
# df.data$gene_description,
# " (",
# df.data[,gene.id.column],
# ")"
# )
# }
###########################################################################
## Deal with PTM datasets ##
if (length(grep("p_site_env", names(df.data))) > 0) {
#Trim if the sequence window is to big
length <- nchar(df.data$p_site_env)
center <- ((length -1)/2)
df.data$p_site_env <- ifelse(
(length > 15),
substr(df.data$p_site_env, center-6,center+8),
df.data$p_site_env
)
one = tolower(substr(df.data$p_site_env, 1, 7))
two = toupper(substr(df.data$p_site_env, 8, 8))
three = tolower(substr(df.data$p_site_env, 9, 16))
df.data$p_site_env = paste(one, two, three, sep = "")
################################################################################
#Add ppos columns to datatable
################################################################################
ppos.vec = c("ppos_minus_7","ppos_minus_6","ppos_minus_5","ppos_minus_4","ppos_minus_3","ppos_minus_2","ppos_minus_1","ppos",
"ppos_plus_1", "ppos_plus_2","ppos_plus_3","ppos_plus_4","ppos_plus_5","ppos_plus_6","ppos_plus_7")
#In this dataset not all sequences are associated with an p_site_env
#df.data[df.data$p_site_env == "", "p_site_env"] = substr(df.data[df.data$p_site_env == "", "sequence_window"], 9,23)
for (i in 1:length(ppos.vec)){
df.data[[ppos.vec[i]]] = sapply(
df.data$p_site_env,
function(x) substr(x, i,i))
}
# Done adding ppos columns
}
## Done dealing with PTM datasets ##
###########################################################################
df.data[is.na(df.data)] = ""
df.data = unique(df.data)
df.data = df.data[!is.na(df.data[, gene.id.column]), ]
df.data[["row_names"]] = 1:nrow(df.data)
names(df.data) = gsub("[.]", "_", names(df.data))
names(df.data) = gsub(" ", "_", names(df.data))
return(df.data)
}
## End of function ##
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @import openxlsx
#' @export
#'
createAndFormatExcelOutputFiles <- function(
obj,
metaCoreCountFilter = 1,
customOutputCols = NULL,
addedOutputCols = NULL
){
###############################################################################
## Create Excel output table ##
if (length(customOutputCols) > 0){
outCols <- customOutputCols
} else {
outCols <- c(
obj@parameterList$geneIDcolumn,
obj@parameterList$primaryAlignmentGeneID,
"gene_description",
"gene_type",
names(obj@databaseTable)[grep("contrast_", names(obj@databaseTable))],
names(obj@databaseTable)[grep("norm_counts_", names(obj@databaseTable))],
names(obj@databaseTable)[grep("raw_counts_", names(obj@databaseTable))],
"count_cut_off",
"CoVar"
)
}
if (length(addedOutputCols) > 0){
outCols <- c(
outCols,
addedOutputCols
)
}
outCols <- outCols[outCols %in% names(obj@databaseTable)]
dfOutput <- unique(obj@databaseTable[,outCols])
## Rename columns ##
names(dfOutput) <- gsub("norm_counts_", "", names(dfOutput))
comparisons <- names(obj@dfDesign)[grep("comp_", names(obj@dfDesign))]
for (i in 1:length(comparisons)){
names(dfOutput) <- gsub(
paste0("contrast_", i, "_"),
"",
names(dfOutput)
)
}
outPutFN <- paste0(obj@parameterList$outputDir, obj@parameterList$project_id,".result.table.txt")
write.table(
dfOutput,
outPutFN,
row.names = FALSE,
sep="\t"
)
## Create Excel file ##
#library(openxlsx)
wb <- openxlsx::createWorkbook()
sheet <- substr(paste0(obj@parameterList$project_id, "_full_DGE_result_list"), 1, 30)
addWorksheet(wb, sheet)
freezePane(wb, sheet , firstActiveRow = 2)
## Filter is inactivated, as it does not appear to be compatible with the current version of Excel
#addFilter(wb, 1, row = 1, cols = 1:ncol(dfOutput))
## Style headers ##
hs1 <- openxlsx::createStyle(
fontColour = "#ffffff",
fgFill = "#000000",
halign = "CENTER",
textDecoration = "Bold"
)
openxlsx::writeData(wb, 1, dfOutput, startRow = 1, startCol = 1, headerStyle = hs1)
openxlsx::saveWorkbook(
wb,
gsub(".txt", ".xlsx", outPutFN) ,
overwrite = TRUE
)
## Done creating Excel output table ##
###############################################################################
###############################################################################
## Create metacore table ##
## Apply minimal filtering ##
df.metacore <- obj@databaseTable[obj@databaseTable$count_cut_off >= metaCoreCountFilter,]
## select padj and logFC columns only ##
sel.vec <- names(df.metacore)[grep("contrast_", names(df.metacore))]
sel.vec <- sel.vec[-grep("stat", sel.vec)]
sel.vec <- sel.vec[-grep("lg10p", sel.vec)]
## Remove contrast_x_ prefix > remove front 11 characters
sel.vec <- append(obj@parameterList$geneIDcolumn, sel.vec)
df.metacore <- unique(df.metacore[,sel.vec])
## Remove contrast_x_ tag from column labels ##
comparisons <- names(obj@dfDesign)[grep("comp_", names(obj@dfDesign))]
for (i in 1:length(comparisons)){
names(df.metacore) <- gsub(
paste0("contrast_", i, "_"),
"",
names(df.metacore)
)
}
outPutFN <- paste0(obj@parameterList$outputDir, obj@parameterList$project_id,".metacore.input.file.txt")
write.table(
df.metacore,
outPutFN,
row.names = FALSE,
sep="\t"
)
wb <- openxlsx::createWorkbook()
sheet <- substr(paste0(obj@parameterList$project_id, "_metacore_input_file"), 1, 30)
openxlsx::addWorksheet(wb, sheet)
## Style headers ##
hs1 <- openxlsx::createStyle(
fgFill = "#4F81BD",
halign = "CENTER",
textDecoration = "Bold",
border = "Bottom",
fontColour = "white"
)
openxlsx::writeData(wb, 1, dfOutput, startRow = 1, startCol = 1, headerStyle = hs1)
openxlsx::saveWorkbook(
wb,
gsub(".txt", ".xlsx", outPutFN) ,
overwrite = TRUE
)
## Done creating Metacore input file ##
###############################################################################
###############################################################################
## Metacore analysis ##
## Open file in excel and save as metacore.input.file.xls (2003)
#print("Perform enrichment analysis by subsetting data to logFC cut off: 1, padj 0.05")
## Select Workflows & Reports
## Select Enrichment analysis
## Set threshold: Threshold 1 p-value 0.05 direction both
## Run analysis
## If necessary, repeat with lower theshold
## If successful hit Get report button and safe as
#print(paste0(project.code, ".metacore.results.enrichment.analysis.xls"))
## Next do transcription factor target analysis ##
## Select One-click Analysis > Transcription Factors
## Set FDR threshold to 0.05
#print(paste0("Export result table as: ", project.code, ".metacore.TF.analysis.", "logFC_nonAligned_vs_aligned"))
## Save results as p111.metacore.result.
## For selected TF targets, export MC results an curate into project category ##
## Select transcription factor of interest (Object name)
## Limit selection: Direction: Outgoing Effect Activation Mechanism influence on expression and transcription
## regulation >> Aplly >> Build Network
## Select additional options
## Pre filters Interaction types transcription regulation
## Additional options Directions: Downstream
## Hit build network
## Select all >> File >> Export >> safe as [TFname.mc.targets.xls]
## End Module add metacore results ##
###############################################################################
print("Excel output files create and depoisted in project/outputs folder")
}
## End: (7c) Create Excel output tables ##
###############################################################################
###############################################################################
## Kill connections ##
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
killDbConnections <- function () {
library(RMySQL)
all_cons <- dbListConnections(MySQL())
print(all_cons)
for(con in all_cons)
res <- dbDisconnect(con)
#print(paste(length(all_cons), " connections killed."))
}
## ##
###############################################################################
#' @title inferDBcategories
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
###############################################################################
## Function inferDBcategories ##
inferDBcategories <- function(
dfData
){
dbCatList <- list()
for (i in 1:length(dfData)){
classLabel <- ""
maxStringLength <- max(nchar(as.character(dfData[,i])), na.rm = T) + 2
if (is.numeric(dfData[,i])){
if (is.integer(dfData[,i])){
if (maxStringLength <= 8) {
classLabel <- "INT(8) NULL DEFAULT NULL"
} else {
classLabel <- "BIGINT(8) NULL DEFAULT NULL"
}
} else {
if (max(dfData[,i], na.rm = T) <= 1){
classLabel <- "DECIMAL(6,5) NULL DEFAULT NULL"
} else if (max(dfData[,i], na.rm = T) <= 1000){
classLabel <- "DECIMAL(6,3) NULL DEFAULT NULL"
} else if (max(dfData[,i], na.rm = T) <= 10000){
classLabel <- "DECIMAL(6,1) NULL DEFAULT NULL"
} else {
classLabel <- "DECIMAL(8,0) NULL DEFAULT NULL"
}
}
} else {
## Running as character
classLabel <- "VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_general_ci"
if (maxStringLength < 100){
classLabel <- "VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci"
} else if (maxStringLength < 50){
classLabel <- "VARCHAR(50) CHARACTER SET utf8 COLLATE utf8_general_ci"
} else if (maxStringLength < 10){
classLabel <- "VARCHAR(10) CHARACTER SET utf8 COLLATE utf8_general_ci"
}
}
pos <- grep(classLabel, names(dbCatList), fixed=TRUE)
if (length(pos) > 0 ){
dbCatList[[classLabel]] <- c(dbCatList[[classLabel]], paste0("^", names(dfData)[i], "$"))
} else {
dbCatList[[classLabel]] <- paste0("^", names(dfData)[i], "$")
}
}
## Make sure row_names is prsent ##
classLabel <- "BIGINT(8) NULL DEFAULT NULL"
pos <- grep(classLabel, names(dbCatList), fixed=TRUE)
if (length(pos) > 0){
dbCatList[[classLabel]] <- c(dbCatList[[classLabel]], paste0("^row_names$"))
} else {
dbCatList[[classLabel]] <- paste0("^row_names$")
}
return(dbCatList)
}
## EOF inferDB category ##
###############################################################################
###############################################################################
# Function upload pca.table.to.db() #
###############################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
upload.pca.table.to.db <- function(
df.pca,
host = "www.biologic-db.org",
prim.data.db = "vtl_data",
password = "",
db.user = "boeings",
PCAdbTableName = "P79_VTL_ES_PCA"){
# Create color pallets
if (length(grep("sample.group_colors", names(df.pca))) == 0){
sample.group.vec <- names(df.pca)[grep("sample_group", names(df.pca))]
sample.group.color.vec <- gsub("sample_group", "sample_group_colors", sample.group.vec)
for (i in 1:length(sample.group.vec)){
group.size <- length(unique(df.pca[,sample.group.vec[i]]))
#library(RColorBrewer)
#selcol <- colorRampPalette(brewer.pal(9,"YlOrBr"))
library(scales)
group.cols <- hue_pal()(group.size)
assign(sample.group.color.vec[i], group.cols)
assign(sample.group.vec[i], unique(df.pca[,sample.group.vec[i]]))
df.col.pca <- data.frame(get(sample.group.vec[i]), get(sample.group.color.vec[i]))
names(df.col.pca) <- c(sample.group.vec[i], sample.group.color.vec[i])
df.pca <- merge(
df.pca,
df.col.pca,
by.x = sample.group.vec[i],
by.y = sample.group.vec[i],
all = TRUE
)
}
}
# Adjust pca column names if neceessary
if (length(grep("^pca", names(df.pca))) > 0){
names(df.pca) <- gsub("^pca", "PC", names(df.pca))
}
if (length(grep("^PCA", names(df.pca))) > 0){
gsub("^PCA", "PC", names(df.pca))
}
library(RMySQL)
#df.pca should contain three columns: sample_name, sample.group, sample.group_color, pca1, pca2, ..., pcaN
df.pca[["row_names"]] <- 1:nrow(df.pca)
dbDB <- dbConnect(drv = RMySQL::MySQL(), user = db.user, password = password, host = host)
dbGetQuery(dbDB, paste("CREATE DATABASE IF NOT EXISTS ", prim.data.db, sep=""))
dbDB <- dbConnect(drv = RMySQL::MySQL(), user = db.user, password = db.pwd, dbname=prim.data.db, host = host)
dbGetQuery(dbDB, paste("DROP TABLE IF EXISTS ", PCAdbTableName, sep=""))
dbWriteTable(dbDB, PCAdbTableName, df.pca, row.names= FALSE)
dbDB <- dbConnect(drv = RMySQL::MySQL(), user = db.user, password = password, host = host, dbname=prim.data.db)
dbGetQuery(dbDB, paste("ALTER TABLE `",PCAdbTableName,"` ADD UNIQUE(`row_names`)", sep=""))
dbGetQuery(dbDB, paste("ALTER TABLE `",PCAdbTableName,"` ADD PRIMARY KEY(`row_names`)", sep=""))
dbGetQuery(dbDB, paste0(
"ALTER TABLE `",
PCAdbTableName,
"` CHANGE `sample_id` `sample_id` VARCHAR(100) CHARACTER SET latin1 COLLATE latin1_swedish_ci"
)
)
# Assign sample group and sample group color columns
var.char.50.cols <- names(df.pca)[grep("sample.group", names(df.pca))]
if (length(var.char.50.cols) > 0){
for (i in 1:length(var.char.50.cols)){
dbGetQuery(dbDB, paste0(
"ALTER TABLE `",
PCAdbTableName,
"` CHANGE `",
var.char.50.cols[i],
"` `",
var.char.50.cols[i],
"` VARCHAR(100) CHARACTER SET latin1 COLLATE latin1_swedish_ci"
)
)
}
}
dec.6.3.cols <- names(df.pca)[grep("^PC", names(df.pca))]
for (i in 1:length(dec.6.3.cols)){
dbGetQuery(dbDB, paste0("ALTER TABLE ",PCAdbTableName,"
CHANGE `",dec.6.3.cols[i],"` `",dec.6.3.cols[i],"` DECIMAL(6,3) NULL DEFAULT NULL"
)
)
}
}
## End of function ##
###############################################################################
###############################################################################
# (19) upload.datatable.to.database()
###############################################################################
## Indexing of gene name column
# CREATE INDEX idx_mgi_symbol ON p268_rna_seq_table(mgi_symbol)
#' @title upload.datatable.to.database
#'
#' @param host URL or IP address of the database server. NULL if mode is SQLite
#' @param user database user name. Needs privileges for INSERT, DELETE, DROP and SELECT
#' @param password database password
#' @param prim.data.db primary database name
#' @param dbTableName Name of the database table to upload
#' @param df.data data.frame to upload to the database
#' @param db.col.parameter.list This list specifies the category for a database column. This ideally is filled using the function inferDbColumns(dbTableName)
#' @param increment = 5000,
#' @param new.table = FALSE,
#' @param first.row.name.index = 1,
#' @param startOnlyWithConnectionCount1 = FALSE,
#' @param cols2Index = NULL,
#' @param mode = "MySQL"
#'
#' @description mode options: SQLite, MySQL
#' @keywords MySQL SQLite Upload Database
#' @import RMySQL RSQLite DBI
#' @export
#'
#'
# 2021-08-1 Added sqlite options #
upload.datatable.to.database <- function(
host = NULL,
user = NULL,
password = NULL,
prim.data.db = "project.database",
dbTableName = "rnaseqdbTableName",
df.data = "df.data.to.upload",
db.col.parameter.list = list(
"VARCHAR(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("gene_description"),
"VARCHAR(50) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("ENSG", "ENSMUSG", "hgnc_symbol", "mgi_symbol", "uniprot", "entrezgene","display_ptm", "^sequence_window", "p_site_env","for_GSEA_gene_chip","associated_gene_name", "gene_type"),
"VARCHAR(1) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("ppos", "amino_acid", "charge","known_site"),
"BIGINT(8) NULL DEFAULT NULL" = c("row_names"),
"INT(8) NULL DEFAULT NULL" = c("row_id", "cluster_order","cluster_id", "count_cut_off", "^position$", "raw_counts"),
"DECIMAL(6,3) NULL DEFAULT NULL" = c("norm_counts", "NES", "logFC", "lg2_avg", "intensity", "^int", "iBAQ","^localization_prob$"),
"DECIMAL(6,5) NULL DEFAULT NULL" = c("padj", "pvalue","^pep$")
),
increment = 5000,
new.table = FALSE,
first.row.name.index = 1,
startOnlyWithConnectionCount1 = FALSE,
cols2Index = NULL,
indexName = NULL,
mode = "MySQL"
){
if (sum( nchar(names(df.data)) > 64) > 0){
print("Table names clipped to 64 characters.")
names(df.data) <- substr(names(df.data), 1, 64)
}
if (startOnlyWithConnectionCount1){
## helper function ##
getConnectonCount <- function(
user= "user",
password = "password",
dbname = "prim.data.db",
host = "host"){
if (mode == "SQLite"){
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite()
)
} else {
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host
)
}
connectionCount <- as.numeric(
DBI::dbGetQuery(
dbDB,
"SELECT COUNT(1) ConnectionCount, user FROM information_schema.processlist WHERE user <> 'system user' AND user = 'boeingS' GROUP BY user;"
)$ConnectionCount
)
DBI::dbDisconnect(dbDB)
return(connectionCount)
}
connectionCount <- getConnectonCount(
user= user,
password = password,
dbname = prim.data.db,
host = host
)
while (connectionCount > 2){
print(paste(connectionCount, "connections open. Sleep 30 seconds and try again."))
Sys.sleep(30)
connectionCount <- getConnectonCount(
user= user,
password = password,
dbname = prim.data.db,
host = host
)
}
}
## Uploading of data frame to database. Happens only if all columns are defined ##
# library(RMySQL)
## Connect to MySQL to check existence of database ##
if (mode == "SQLite"){
prim.data.dbPath <- unlist(strsplit(prim.data.db, "/"))[1:(length(unlist(strsplit(prim.data.db, "/")))-1)]
prim.data.dbPath <- paste0(prim.data.dbPath, collapse = "/")
if (!dir.exists(prim.data.dbPath)){
dir.create(prim.data.dbPath, recursive = T)
}
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname=prim.data.db
)
} else {
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
#dbname=prim.data.db,
new.table = TRUE
)
## Create the database if it does not exist already##
res <- DBI::dbGetQuery(
dbDB,
paste(
"CREATE DATABASE IF NOT EXISTS ",
prim.data.db,
sep = ""
)
)
}
RMySQL::dbDisconnect(dbDB)
## Ensure that df.data has a row_names column ##
df.data[["row_names"]] <- first.row.name.index:(first.row.name.index+nrow(df.data)-1)
## Check if all columns are assigned in db.col.parameter.list ##
all.col.string.vec <- as.vector(do.call('c', db.col.parameter.list))
## Create a vector that contains all col names that contain at least in part the string in all.cols.vec
###############################################################################
## Function start ##
get.all.col.names.with.these.strings <- function(all.col.string.vec){
all.assigned.cols <- vector(mode="character", length=0)
for (i in 1:length(all.col.string.vec)){
pos <- grep(all.col.string.vec[i], names(df.data))
if (length(pos) > 0){
all.assigned.cols <- append(all.assigned.cols, names(df.data)[pos])
}
}
return(all.assigned.cols)
}
## End of function ##
###############################################################################
all.assigned.cols <- get.all.col.names.with.these.strings(all.col.string.vec)
## Ensure that all database columns are assigned ##
not.assigned <- names(df.data)[!(names(df.data) %in% all.assigned.cols)]
if (length(not.assigned) == 0){
print("All database columns are defined. Uploading to database...")
} else {
print(
paste0(
"The following database columns have not been defined: ",
paste(not.assigned,
collapse = ', '
),
". Datatable completed.")
)
stop(not.assigned)
}
## Establish connection ##
if (mode == "SQLite"){
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname=prim.data.db
)
} else {
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname=prim.data.db
)
}
## Remove all tables with the same name from db ##
if (new.table){
res <- DBI::dbExecute(
dbDB,
paste(
"DROP TABLE IF EXISTS ",
dbTableName,
sep = ""
)
)
RMySQL::dbDisconnect(dbDB)
}
## Upload up to increment rows in one go ##
iter <- nrow(df.data)%/%increment
if (nrow(df.data)%%increment != 0){
iter <- iter + 1
}
totalRows <- nrow(df.data)
for (i in 1:iter){
if (nrow(df.data) > increment){
limit <- increment
} else {
limit <- nrow(df.data)
}
df.temp <- df.data[1:limit,]
df.data <- df.data[(increment+1):nrow(df.data),]
uploaded = FALSE
while (!uploaded){
tryCatch({
biologicSeqTools::killDbConnections()
## Establish connection ##
if (mode == "SQLite"){
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname=prim.data.db
)
} else {
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname=prim.data.db
)
}
## Upload new dataframe to database ##
res <- DBI::dbWriteTable(
dbDB,
dbTableName,
df.temp,
row.names = FALSE,
append = TRUE,
overwrite = FALSE
)
RMySQL::dbDisconnect(dbDB)
uploaded = TRUE
#dbDisconnect(dbDB)
}, error=function(e){cat("Upload errror :",conditionMessage(e), "\n")})
}
print(paste0(i * increment, " rows out of ",totalRows," processed..."))
## Connect to database for dbtable upload ##
}
print("Processing successfully completed")
####################################################
## Function alterDBtable
alterDBtable <- function(
cmd.string = "mysql command",
user = "user",
password = "password",
dbname = "prim.data.db",
host = "host"
){
## Establish connection ##
if (mode == "SQLite"){
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname=prim.data.db
)
} else {
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname=prim.data.db
)
}
tryCatch({
DBI::dbExecute(
dbDB,
cmd.string
)}, error=function(e) {paste0("Alter not executed. cmd.vector[", i, "]")})
RMySQL::dbDisconnect(dbDB)
}
## End of function ##
######################
mysql.cmd = ""
if (new.table){
alterDBtable(
cmd.string = paste(
"ALTER TABLE `",
dbTableName,
"` ADD UNIQUE(`row_names`)",
sep = ""
),
user = user,
password = password,
dbname = dbname,
host = host
)
## Describe key columns in database table ##
mysql.cmd <- paste(
"ALTER TABLE `",
dbTableName,
"` ADD UNIQUE(`row_names`)",
sep = ""
)
alterDBtable(
cmd.string = paste(
"ALTER TABLE `",
dbTableName,
"` ADD PRIMARY KEY(`row_names`)",
sep = ""
),
user = user,
password = password,
dbname = dbname,
host = host
)
###############################################################################
## Characterize and define secondary database columns ##
for (i in 1:length(db.col.parameter.list)) {
descriptor <- names(db.col.parameter.list[i])
cols.in.class <-
get.all.col.names.with.these.strings(db.col.parameter.list[[i]])
if (length(cols.in.class) > 0) {
print(
paste0(
"Assigned ",
paste0(cols.in.class, collapse = ', '),
" as ",
descriptor, "."
)
)
## Assign column names to MySQL class ##
alteration.string <-
paste0("ALTER TABLE ", dbTableName, " ")
for (j in 1:length(cols.in.class)) {
alteration.string <- paste0(
alteration.string,
paste0(
"CHANGE `", cols.in.class[j], "` `", cols.in.class[j], "` ", descriptor, ", "
)
)
}
## Remove last comma from string
alteration.string <-
substr(alteration.string, 1, (nchar(alteration.string) - 2))
## Carry out alteration
## Connect to database for dbtable upload ##
## Connection is repated to avoid loss of a short lived connection.
alterDBtable(
cmd.string = alteration.string,
user = user,
password = password,
dbname = dbname,
host = host
)
}
#print(alteration.string)
mysql.cmd <- append(mysql.cmd,
alteration.string)
#dbGetQuery(dbDB,
# alteration.string)
}
}
## End characterize and define secondary database columns ##
#############################################################################
## Add index based on row namems ##
if (length(cols2Index) > 0){
for (i in 1:length(cols2Index)){
print("...indexing...")
## Get all existing indixes
if (mode == "SQLite"){
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname=prim.data.db
)
## get all existing indeces
listCmd <- paste0(
"SELECT name FROM sqlite_master WHERE type = 'index';"
)
indexVec <- as.vector(DBI::dbGetQuery(
dbDB,
listCmd
)[,1])
}
if (!is.null(indexName)){
if (!is.na(indexName[i])){
indexCmdName <- indexName[i]
} else {
indexCmdName <- paste0("idx_",cols2Index[i])
}
} else {
indexCmdName <- paste0("idx_",cols2Index[i])
}
if (mode == "SQLite"){
while (sum(indexCmdName %in% indexVec) > 0){
indexCmdName <- paste0(indexCmdName, sample(9, 1))
}
}
cmd.string <- paste0("CREATE INDEX ",indexCmdName," ON ",dbTableName," (",cols2Index[i],")")
## Establish connection ##
if (mode == "SQLite"){
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname=prim.data.db
)
## get all existing indeces
listCmd <- paste0(
"SELECT name FROM sqlite_master WHERE type = 'index';"
)
indexVec <- DBI::dbGetQuery(
dbDB,
listCmd
)
} else {
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname=prim.data.db
)
}
tryCatch({
DBI::dbExecute(
dbDB,
cmd.string
)}, error=function(e) {stop(paste0("Database table not uploaded. Problem adding index ",cols2Index[i],"."))})
DBI::dbDisconnect(dbDB)
print(paste0("Datatable ", dbTableName, " successfully uploaded and column(s) ",paste(cols2Index, collapse = " ")," indexed."))
}
}
return(mysql.cmd)
}
#RENAME TABLE p131_rna_seq_table_part_1 TO p131_rna_seq_table
#INSERT INTO p131_rna_seq_table SELECT * FROM p131_rna_seq_table_part_2;
#INSERT INTO p131_rna_seq_table SELECT * FROM p131_rna_seq_table_part_3;
#INSERT INTO p131_rna_seq_table SELECT * FROM p131_rna_seq_table_part_4;
#INSERT INTO p131_rna_seq_table SELECT * FROM p131_rna_seq_table_part_5;
#DROP TABLE p131_rna_seq_table_part_2;
#DROP TABLE p131_rna_seq_table_part_3;
#DROP TABLE p131_rna_seq_table_part_4;
#DROP TABLE p131_rna_seq_table_part_5;
## End of function ##
###############################################################################
###################################
# (17) Load datatable from database#
###################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
import.db.table.from.db <- function(dbtable = "interpro_categori",
dbname = "reference_categories_db_new",
user = "boeings",
password = "",
host = "www.biologic-db.org"
){
## Helper function ##
oldw <- getOption("warn")
options(warn = -1)
## End helper function ##
library(RMySQL)
## Create connection
dbDB <- dbConnect(MySQL(), user = user, password = password, host = host, dbname=dbname)
## Get number of expected rows from query ##
nrows.to.download <- dbGetQuery(dbDB, paste0("SELECT COUNT(*) FROM ",dbtable))
dbDisconnect(dbDB)
download = TRUE
i=1
while (download) {
dbDB <- dbConnect(MySQL(), user = user, password = password, host = host, dbname=dbname)
out <- tryCatch({
df.data = dbGetQuery(dbDB, paste0("SELECT DISTINCT * FROM ", dbtable))
},
error=function() {
message("Database error")
# Choose a return value in case of error
}
)
dbDisconnect(dbDB)
if (nrow(df.data) == nrows.to.download){
download = FALSE
print(paste0(nrow(df.data), " of ", nrows.to.download, " rows downloaded."))
} else {
print(paste0("Expected: ", nrows.to.download, ". Downloaded: ", nrow(df.data), ". "))
print(paste0("Download failed for the ", i, " time. Try again..."))
i = i+1
}
}
options(warn = oldw)
return(df.data)
}
# End of function
###############################################################################
# (9A) create.GSEA.rnk.files #
###############################################################################
#Create rnk files for all logFC comparisons
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
create.gsea.rnk.files <- function(
localWorkDir,
df.dataTable,
GSEA.colum.type = "stat",
gene.symbol.column.name = "hgnc_symbol",
medianCollapse = TRUE,
GSEADir = NULL) {
#setwd(localWorkDir)
#Create GSEA directory
if (is.null(GSEADir)){
GSEADir = paste(localWorkDir, "GSEA/", sep = "")
}
if (!dir.exists(GSEADir)) {
dir.create(GSEADir)
}
#Get Stat columns from the table
#setwd(GSEADir)
## Create rnk files ##
stat.samples = names(df.dataTable)[grep(GSEA.colum.type, names(df.dataTable))]
for (i in 1:length(stat.samples)) {
df.rnk = unique(df.dataTable[,c(gene.symbol.column.name, stat.samples[i])])
df.rnk = df.rnk[order(df.rnk[,stat.samples[i]]),]
names(df.rnk) = c(gene.symbol.column.name, GSEA.colum.type)
df.rnk = na.omit(df.rnk)
stat.col.name <-
names(df.rnk)[grep(GSEA.colum.type, names(df.rnk))]
df.rnk = df.rnk[df.rnk[,stat.col.name] != "",]
df.rnk = df.rnk[df.rnk[,gene.symbol.column.name] != "",]
df.rnk[,GSEA.colum.type] = as.numeric(df.rnk[,GSEA.colum.type])
df.rnk <- unique(df.rnk)
## Collapse based on gene id ##
duplicatedIds <- unique(
df.rnk[
duplicated(
df.rnk[,gene.symbol.column.name]),
gene.symbol.column.name
]
)
if (length(duplicatedIds) > 0){
for (j in 1:length(duplicatedIds)){
value <- median(df.rnk[df.rnk[,gene.symbol.column.name] == duplicatedIds[j],stat.col.name])
df.rnk[df.rnk[,gene.symbol.column.name] == duplicatedIds[j],stat.col.name] <- value
}
}
df.rnk <- unique(df.rnk)
## done collapsing ##
write.table(
df.rnk, paste(GSEADir, stat.samples[i], ".rnk", sep = ""), row.names = FALSE,
sep = "\t",
eol = "\r",
quote = F
)
}
#The .rnk files currently need to be opened once in excel and saved before they can be processed by the GSEA
#desktop application
#GSEA parameters
#Collapse dataset to gene symbols = FALSE
#Number of permutations = 1000
#C2: Min. size 15,
#Got to GSEA to calculate category enrichments
#Write result folders to .../GSEA
#setwd(localWorkDir)
print(paste0("GSEA .rnk for all contrasts were created in ", GSEADir))
}
## End of function ##
#################################################################################
################################################################################
# (27) db.cat2gmt() #
################################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
dbcat2gmt <- function(
df.cat, # As downloaded from reference_categories_db_new database
gene.id.column = "hgnc_symbol"
){
cat.list <- sapply(df.cat[,gene.id.column], function(x) list(x) )
names(cat.list) <- as.vector(df.cat[,"cat_id"])
cat.list <- lapply(cat.list, function(x) unlist(strsplit(x, ";")))
cat.list <- lapply(cat.list, function(x) x[x != ""])
## Equalize length of all vectors with empty spaces ##
l2 <- lapply(cat.list, function(x) length(x))
cat.list <- cat.list[l2 > 0]
l2 <- lapply(cat.list, function(x) length(x))
maxVecLength <- max(unlist(l2))
l2 <- lapply(l2, function(x) maxVecLength - x)
l2 <- lapply(l2, function(x) rep("", x))
#library(tidyverse)
#cat.list <- purrr::map2(cat.list, l2,c)
# cat.list <- rep("", length(cat.list))
for (k in 1:length(cat.list)){
cat.list[[k]] <- c(cat.list[[k]], l2[[k]])
}
## Add first two columns ##
df.cat <- df.cat[df.cat$cat_id %in% names(cat.list),]
lColName <- sapply(df.cat[,"cat_name"], function(x) list(x) )
lColCom <- sapply(df.cat[,"comments_1"], function(x) list(x) )
# laddCols <- purrr::map2(lColName, lColCom, c)
laddCols <- list()
#
for (k in 1:length(lColName)){
laddCols[[k]] <- c(lColName[[k]], lColCom[[k]])
}
#cat.list <- purrr::map2(laddCols, cat.list, c)
for (k in 1:length(cat.list)){
cat.list[[k]] <- c(laddCols[[k]], cat.list[[k]])
}
df.gmt <- as.data.frame(do.call(rbind, cat.list))
return(df.gmt)
}
## End function dbcat2gmt() ##
###############################################################################
################################################################################
# (12) create.gmt.file.from.ref.data.table() #
################################################################################
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
create.gmt.file.from.ref.data.table <- function(
host = 'www.biologic-db.org',
dbname = "reference_categories_db_new",
dataTable = "st_lab_categories",
pwd = "Saturday08",
user="boeings",
gene.id.column = "hgnc_symbol"){
## Step 1: retrieve database table ##
library(RMySQL)
for (i in 1:length(dataTable)){
dbDB <- dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = pwd,
dbname= dbname,
host = host
)
out <- tryCatch({
df.temp = dbGetQuery(
dbDB,
paste("SELECT DISTINCT * FROM ", dataTable[i], sep=""))
},
error=function() {
message("Database error")
df.temp = NA
# Choose a return value in case of error
}
)
dbDisconnect(dbDB)
if (i ==1){
df.res <- df.temp
} else {
df.res <- rbind(df.res, df.temp)
}
}
## Step 2: transform to gmt file ##
###############################################################################
## Function dbcat2gmt ##
## Example of usage is given in 20170505.procedure.gmt2.median.timecourse.r
# dbcat2gmt <- function(
# df.cat, # As downloaded from reference_categories_db_new database
# gene.id.column = "hgnc_symbol"
# ){
# cat.list <- sapply(df.cat[,gene.id.column], function(x) list(x) )
# names(cat.list) <- as.vector(df.cat[,"cat_id"])
# cat.list <- lapply(cat.list, function(x) unlist(strsplit(x, ";")))
# cat.list <- lapply(cat.list, function(x) x[x != ""])
#
# ## Equalize length of all vectors with empty spaces ##
# l2 <- lapply(cat.list, function(x) length(x))
# cat.list <- cat.list[l2 > 0]
#
# l2 <- lapply(cat.list, function(x) length(x))
#
# maxVecLength <- max(unlist(l2))
# l2 <- lapply(l2, function(x) maxVecLength - x)
# l2 <- lapply(l2, function(x) rep("", x))
# library(tidyverse)
#
# cat.list <- purrr::map2(cat.list, l2,c)
#
# ## Add first two columns ##
# df.cat <- df.cat[df.cat$cat_id %in% names(cat.list),]
# lColName <- sapply(df.cat[,"cat_name"], function(x) list(x) )
# lColCom <- sapply(df.cat[,"comments_1"], function(x) list(x) )
#
# laddCols <- purrr::map2(lColName, lColCom, c)
# cat.list <- purrr::map2(laddCols, cat.list, c)
#
# df.gmt <- as.data.frame(do.call(rbind, cat.list))
#
#
# return(df.gmt)
# }
## End function dbcat2gmt() ##
###############################################################################
df.gmt <- dbcat2gmt(
df.cat = df.res,
gene.id.column = gene.id.column
)
df.gmt <- data.frame(df.gmt)
return(df.gmt)
}
##
## End of function create.gmt.file.from.ref.data.table() ##
###############################################################################
###############################################################################
# (9B) create.GSEA.table #
###############################################################################
## Run GSEA
## java -jar ${ROOTGSEA}/gsea.jar -Xmx512m -cp xtools.gsea.GseaPreranked -gmx /camp/stp/babs/working/boeings/Projects/reference_data/GSEA.gmt.files/20160508.rna.seq.txn.analysis.gmt -norm meandiv -nperm 1000 -rnk /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA/contrast_1_logFC_IFNARLA_vs_WTLA30.rnk -scoring_scheme weighted -rpt_label contrast_1_rnaSeqTxnAnalysis -create_svgs false -make_sets true -plot_top_x 100 -rnd_seed timestamp -set_max 500 -set_min 15 -zip_report false -out /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA -gui false
## This appears to work ##
#java -Xmx2512m -cp /camp/stp/babs/working/boeings/Projects/software/gsea-3.0.jar xtools.gsea.GseaPreranked -gmx /camp/stp/babs/working/boeings/Projects/reference_data/GSEA.gmt.files/20160508.rna.seq.txn.analysis.gmt -rnk /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA/contrast_1_logFC_IFNARLA_vs_WTLA30.rnk -rpt_label contrast_1_rnaSeqTxnTest -out /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA -collapse false -mode Max_probe -norm meandiv -nperm 1000 -scoring_scheme classic -include_only_symbols true -make_sets true -plot_top_x 100 -rnd_seed timestamp -set_max 2500 -set_min 10 -zip_report false -gui false
#module load Java/1.8.0_131
#cat(' sbatch --time=12:00:00 --wrap "java -jar ${EBROOTPICARD}/picard.jar AddOrReplaceReadGroups \\'); cat('\n');
#cat(sbatch --time=12:00:00 --wrap "java -Xmx2512m -cp /camp/stp/babs/working/boeings/Projects/software/gsea-3.0.jar xtools.gsea.GseaPreranked -gmx /camp/stp/babs/working/boeings/Projects/reference_data/GSEA.gmt.files/20160508.rna.seq.txn.analysis.gmt -rnk /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA/contrast_1_logFC_IFNARLA_vs_WTLA30.rnk -rpt_label contrast_1_rnaSeqTxnTest -out /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA -collapse false -mode Max_probe -norm meandiv -nperm 1000 -scoring_scheme classic -include_only_symbols true -make_sets true -plot_top_x 100 -rnd_seed timestamp -set_max 2500 -set_min 10 -zip_report false -gui false'" --job-name="job" -c 1 --mem-per-cpu=7000 -o gsea.slurm >> commands.txt'); cat('\n');
#sbatch --time=12:00:00 --wrap "java -Xmx7000m -cp /camp/stp/babs/working/boeings/Projects/software/gsea-3.0.jar xtools.gsea.GseaPreranked -gmx /camp/stp/babs/working/boeings/Projects/reference_data/GSEA.gmt.files/20160508.rna.seq.txn.analysis.gmt -rnk /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA/contrast_1_logFC_IFNARLA_vs_WTLA30.rnk -rpt_label contrast_1_rnaSeqTxnTest -out /camp/stp/babs/working/boeings/Projects/126_SL_MP_RNA_seq_macrophage_stimulation_timecourse/workdir/GSEA -collapse false -mode Max_probe -norm meandiv -nperm 1000 -scoring_scheme classic -include_only_symbols true -make_sets true -plot_top_x 100 -rnd_seed timestamp -set_max 2500 -set_min 10 -zip_report false -gui false" --job-name='job' -c 1 --mem-per-cpu=7000 -o gsea.slurm >> commands.txt
## end of working
#' @title A method
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
create.GSEA.table <- function(
GSEADir,
logFC.column.name = "logFC",
host = 'www.biologic-db.org',
refdbname= "reference_categories_db_new",
refDBTableName = paste0(project.code, "_VTL_ES_enriched_categories_table"),
db.user = db.user,
db.password = db.pwd,
tables = tables,
df.dataTable,
outputDir = "", # this should be set to projectDir/outputs
project.code = "p001"
){
#####################################################################
#Function#
#####################################################################
compile.gsea.results <- function(folder.name)
{cwd = getwd()
setwd(folder.name)
result.xls.list = list.files()[grep("gsea_report", list.files())]
result.xls.list = result.xls.list[grep(".xls", result.xls.list)]
df.res1 = read.delim(result.xls.list[1], header=TRUE, sep="\t", stringsAsFactors = FALSE)
df.res2 = read.delim(result.xls.list[2], header=TRUE, sep="\t", stringsAsFactors = FALSE)
df.res = rbind(df.res1, df.res2)
setwd(cwd)
return(df.res)
}
#End of function
#Collate GSEA output files
setwd(GSEADir)
folder.list = list.files()[(grep("Gsea", list.files()))]
if (length(grep("error", folder.list)) > 0){
folder.list <- folder.list[-grep("error", folder.list)]
}
samples = unique(substr(folder.list,1, 10))
logFC.samples = names(df.dataTable)[grep(logFC.column.name, names(df.dataTable))]
for (j in 1:length(samples)){
setwd(GSEADir)
folder.list = list.files()[(grep("Gsea", list.files()))]
folder.list = folder.list[grep(samples[j], folder.list)]
if (length(grep("error", folder.list)) > 1){
folder.list <- folder.list[-grep("error", folder.list)]
}
for (i in 1:length(folder.list)){
name = paste("cat.results", samples[j], sep="_")
name.all = paste("all.cat.results", samples[j], sep="_")
assign(name, compile.gsea.results(folder.name=folder.list[i]))
assign(name, get(name)[,c("NAME", "FDR.q.val", "NES")])
if (i == 1){
assign(name.all, get(name))
} else {
assign(name.all, rbind(get(name.all), get(name)))
}
}
assign(name.all, get(name.all)[get(name.all)$FDR.q.val <= 0.05, ])
if (j == 1){
all.samples.vector = name.all
enriched.categories = get(name.all)
names(enriched.categories)[which(names(enriched.categories) == "FDR.q.val")] = paste("padj_", substr(logFC.samples[j], 18, 50), sep="")
names(enriched.categories)[which(names(enriched.categories) == "NES")] = paste("NES_", substr(logFC.samples[j], 18, 50), sep="")
} else {
all.samples.vector = append(all.samples.vector, name.all)
enriched.categories = merge(enriched.categories, get(name.all), by.x="NAME", by.y="NAME", all=TRUE)
enriched.categories[is.na(enriched.categories)] = "1"
names(enriched.categories)[which(names(enriched.categories) == "FDR.q.val")] = paste("padj_", substr(logFC.samples[j], 18, 50), sep="")
names(enriched.categories)[which(names(enriched.categories) == "NES")] = paste("NES_", substr(logFC.samples[j], 18, 50), sep="")
}
}
#######################################################################
#Now, get all image files and compile in one folder via a shell script#
#######################################################################
shellFileVec <- as.vector(NULL, mode="character")
gseaScriptVec <- c("#!/bin/sh", "\n")
gseaTrVec <- as.vector(NULL, mode="character")
gseaShVec <- as.vector(NULL, mode="character")
for (j in 1:length(samples)){
setwd(GSEADir)
folder.list = list.files()[(grep("Gsea", list.files()))]
folder.list = folder.list[grep(samples[j], folder.list)]
if (length(grep("error", folder.list)) > 1){
folder.list <- folder.list[-grep("error", folder.list)]
}
for (i in 1:length(folder.list)){
cwd = getwd()
setwd(folder.list[i])
temp.file.list = list.files()[grep("enplot", list.files())]
temp.path.list = paste(folder.list[i], temp.file.list, sep="/")
temp.target.file = paste("enrichment_plots",samples[j], temp.file.list, sep="/")
temp.contrast = rep(samples[j], length(temp.file.list))
if (i ==1){
full.file.list = temp.file.list
full.path.list = temp.path.list
full.target.list = temp.target.file
contrast.list = temp.contrast
} else {
full.file.list = append(full.file.list, temp.file.list)
full.path.list = append(full.path.list, temp.path.list)
full.target.list = append(full.target.list, temp.target.file)
contrast.list = append(contrast.list, temp.contrast)
}
setwd(cwd)
}
cat_names = sapply(full.file.list, function(x) unlist(strsplit(x, "_")))
cat_names = sapply(cat_names, function(x) paste(x[3:length(x)-1], collapse = "_"))
## Escape all ( and ) in full path list
full.path.list <- gsub("\\(", "\\\\\\(",full.path.list)
full.path.list <- gsub("\\)", "\\\\\\)",full.path.list)
full.path.list <- gsub("\\&", "\\\\\\&",full.path.list)
full.path.list <- gsub("\\;", "\\\\\\;",full.path.list)
full.path.list <- gsub("\\'", "\\\\\\'",full.path.list)
df.files = data.frame(full.file.list, full.path.list, cat_names, full.target.list, contrast.list)
#Create shell script to transfer all png.files
#Destinatin folder GSEA/enrichment_plots
setwd(GSEADir)
gseaShVec <- c(
gseaTrVec,
paste0(
"sh ",
samples[j],
".file.transfer.sh"
)
)
# gseaShVec <- c(
# gseaShVec,
# paste0(
# "sh conv.",
# samples[j],
# ".file.transfer.sh"
# )
# )
#print(paste0("If you create the shell script on a windows machine, remove end of line character with the command: \\n",
# " tr -d '\r' <",paste(samples[j], ".file.transfer.sh", sep=""),"> conv.",
# paste(samples[j], ".file.transfer.sh", sep="")))
## List all transfer scripts ##
shellFileVec <- c(
shellFileVec,
# paste0(
# "tr -d '\\\r' <",
# paste(
# samples[j],
# ".file.transfer.sh",
# sep=""
# ),
# "> conv.",
# paste(
# samples[j],
# ".file.transfer.sh",
# sep="")
# ),
# "",
paste0(
"sh ",
paste(
# "conv.",
samples[j],
".file.transfer.sh",
sep=""
)
),
""
)
## Make individual shell script ##
sink(file=paste(samples[j], ".file.transfer.sh", sep=""))
cat('mkdir -p enrichment_plots'); cat('\n');
cat(paste("mkdir enrichment_plots/",samples[j], sep="")); cat("\n");
if (j ==1){
image.link.array <- paste0(samples[j], "_image_link", sep="")
} else {
image.link.array <- append(image.link.array,paste0(samples[j], "_image_link", sep=""))
}
for (i in 1:length(full.path.list)){
cmd = paste("cp ", full.path.list[i], " ./enrichment_plots/", samples[j], sep="")
# Escape all $ and ( and ) in file names
cat(cmd); cat("\n");
}
sink()
if (j == 1){
df.all.files = df.files
} else {
df.all.files = rbind(df.all.files, df.files)
}
}
## Create masterscript ##
setwd(GSEADir)
sink("GSEAmasterscript.sh")
for (a in 1:length(shellFileVec)){
cat(shellFileVec[a]); cat("\n");
}
sink()
print(
paste0(
"If you create the shell script on a windows machine, remove end of line character with the command: \\n",
" tr -d '\r' <GSEAmasterscript.sh> conv.GSEAmasterscript.sh"
)
)
sink("GSEAplotfileTransfer.sh")
cat("#!/bin/sh");cat("\n");
#for (a in 1:length(gseaTrVec)){
# cat(gseaTrVec[a]); cat("\n");
#}
#cat("\n");
cat("#Copying files");cat("\n");
for (a in 1:length(gseaShVec)){
cat(gseaShVec[a]); cat("\n");
}
sink()
print(
paste0(
"If you create the shell script on a windows machine, remove end of line character with the command: \\n",
" tr -d '\r' <GSEAmasterscript.sh> conv.GSEAmasterscript.sh"
)
)
print(
paste0(
"If you create the shell script on a windows machine, remove end of line character with the command: \\n",
" tr -d '\r' <GSEAplotfileTransfer.sh> conv.GSEAplotfileTransfer.sh"
)
)
#####################
#Upload to database##
#####################
#Create database table
#List of enriched categories: FDR <= 0.25 all.cat.results
#merge all.cat.results$NAME to db_table_cat_name (1:1 match)
library(RMySQL)
#tables = tables[grep("mysigdb", tables)]
#Get all categories from all tables
for (i in 1:length(tables)){
temp.cat.list <- import.db.table.from.db(
host = host,
dbname = refdbname,
dbtable = tables[i],
password = db.pwd,
user = db.user
)
temp.cat.list <- unique(
temp.cat.list[,
c(
"cat_id",
"cat_name",
"cat_type",
"data_source",
"comments_1",
"comments_2",
"cat_item_size"
)
]
)
if (length(grep("mysigdb", tables[i])) == 0){
temp.cat.list$comments_2 = ""
temp.cat.list$cat_name = toupper(temp.cat.list$cat_name)
temp.cat.list$cat_name = gsub(" ", "_", temp.cat.list$cat_name)
temp.cat.list$cat_name = gsub("'", "", temp.cat.list$cat_name)
}
if (i ==1){
full.cat.list = temp.cat.list
} else {
full.cat.list = rbind(full.cat.list, temp.cat.list)
}
}
#Merge with result list
enriched.categories = unique(merge(enriched.categories, full.cat.list, by.x = "NAME", by.y = "cat_name"))
df.all.files = df.all.files[c("cat_names", "full.file.list", "contrast.list", "full.target.list")]
names(df.all.files)= c("cat_names" ,"full.file.list", "contrast", "image_link")
df.all.files$cat_names = sapply(df.all.files$cat_names, function(x) gsub("V_", "V$", x))
for (j in 1:length(samples)){
df.temp = df.all.files[df.all.files$contrast == samples[j],]
df.temp = df.temp[,c("cat_names", "image_link")]
names(df.temp) = c("cat_names", paste(samples[j], "_image_link", sep=""))
enriched.categories = unique(merge(enriched.categories, df.temp, by.x = "NAME", by.y = "cat_names", all=TRUE))
write.table(enriched.categories, "temp.file.txt", row.names=FALSE, sep="\t")
enriched.categories = read.delim("temp.file.txt", header=TRUE, sep="\t", stringsAsFactors = FALSE)
enriched.categories[is.na(enriched.categories)] = ""
}
setwd(GSEADir)
if (dir.exists(outputDir)){
setwd(outputDir)
} else if (outputDir != ""){
dir.create(outputDir)
if (dir.exists(outputDir)){
setwd(outputDir)
}
}
write.table(enriched.categories, paste0(project.code, ".enriched.categories.txt"), row.names=FALSE, sep="\t")
df1 = read.delim(paste0(project.code, ".enriched.categories.txt"), header=TRUE, sep="\t", stringsAsFactors = FALSE)
df1[is.na(df1)] = ""
df1[["enrichment_type"]] = "GSEA"
names(df1)[grep("NAME", names(df1))] = "cat_name"
#names(df1)[grep("FDR.q.val", names(df1))] = "FDR_q_val"
df1 = df1[df1$cat_id != "",]
write.table(df1, paste0(project.code, ".enriched.categories.txt"), row.names=FALSE, sep="\t")
#df1[df1$FDR_q_val == 0, "FDR_q_val"] = 0.0001
df.enriched = df1
#Upload to database
##############################################
# Create score column for ordering columns
##############################################
padj.cols <- names(df.enriched)[grep("padj", names(df.enriched))]
if (length(padj.cols) > 0){
df.enriched[["gsea_display_score"]] <- as.numeric(df.enriched[,padj.cols[1]])
if (length(padj.cols) > 1){
for (l in 2:length(padj.cols)){
df.enriched[,"gsea_display_score"] <- as.numeric(df.enriched[,"gsea_display_score"]) + as.numeric(df.enriched[,padj.cols[l]])
}
}
}
df.enriched[["row_names"]] = 1:nrow(df.enriched)
image.link.vector = names(df.enriched)[grep("image_link", names(df.enriched))]
padj.vector = names(df.enriched)[grep("padj", names(df.enriched))]
NES.vector = names(df.enriched)[grep("NES", names(df.enriched))]
df.enriched <- unique(df.enriched)
upload.datatable.to.database(
host = host,
user = db.user,
password = db.pwd,
prim.data.db = "enriched_categories",
dbTableName = refDBTableName,
df.data = df.enriched,
db.col.parameter.list = list(
"VARCHAR(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("cat_name", "comments_1", "comments_2",image.link.vector),
"VARCHAR(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("cat_id", "cat_type","data_source"),
"VARCHAR(50) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("enrichment_type"),
"BIGINT(8) NULL DEFAULT NULL" = c("row_names"),
"INT(5) NULL DEFAULT NULL" = c("cat_item_size"),
"DECIMAL(6,3) NULL DEFAULT NULL" = c("gsea_display_score", NES.vector),
"DECIMAL(9,8) NULL DEFAULT NULL"= padj.vector
),
increment = 5000,
new.table = TRUE,
first.row.name.index = 1
)
# End upload to database
# Create vector with parameters for ini file
###############################################################################
# The section below needs to relocate to create.parameters #
###############################################################################
###########################
#Create display parameters#
###########################
gsea.cat.lines <- "#################"
gsea.cat.lines <- append(gsea.cat.lines, "#GSEA Parameters#")
gsea.cat.lines <- append(gsea.cat.lines, "#################")
gsea.cat.lines <- append(gsea.cat.lines, paste('$enriched_categories_table="',refDBTableName,'";', sep=""))
enriched.cols = paste(NES.vector, collapse ="','")
enriched.cols = paste("$enriched_padj_cols = array('", enriched.cols, "');", sep="")
gsea.cat.lines <- append(gsea.cat.lines, enriched.cols)
image.link.array <- paste(image.link.array, collapse = "', '")
image.link.array <- paste0("$image_link_array = array('",image.link.array,"');")
gsea.cat.lines <- append(gsea.cat.lines, image.link.array)
return(gsea.cat.lines)
}
## End of function ##
###############################################################################
###############################################################################
## (2B) createSettingsFile() ##
#' @title createSettingsFile
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createSettingsFile <- function(
obj = "object",
df.data = 'database.table',
publicDataset = FALSE,
defaultXcolName = NULL,
defaultYcolName = NULL,
timepointName = NULL,
sample.order = "names(database.table)[grep('norm_counts', names(databse.table))]", #set to "" to go with default sorting
heatmapSampleOrder = "lg2_avg vec",
sample.names = "", # default is sample.order
count.sample.colors = 'rainbow(length(sample.order))',
ptm.colum = "display_ptm",
count.table.headline = "PTM ratio H/L counts for all samples",
count.table.sidelabel = "Counts",
venn.slider.selector.strings = 'c("contrast_x_logFC", "constrast_x_padj")',
plot.selection.strings = 'c(
"_logFC",
"_PCA_",
"_lg10p"
)',
webSiteDir = "/camp/stp/babs/working/boeings/Stefan/protocol_files/github/biologic/src/experiments",
upper_heatmap_limit = 3,
lower_heatmap_limit = -3,
heamap.headline.text = "heamap.headline.text",
project_id = "project_id",
primDataTable = "p123_rna_seq_table",
pcaDbTable = NULL,
pointer = "Gene Symbol:"
){
###############################################################################
## Create timecourse string from dfDesign ##
createTSparams <- function(
dfDesign = Obio@dfDesign,
timepointName = "Timepoint"
) {
tsOrder <- as.numeric(sort(unique(dfDesign$timepoint)))
scriptVec <- as.vector(NULL, mode = "character")
scriptVec <- c(
scriptVec,
"// New Begin: Timecourse",
"'timecourse_chart' => array(",
" 'timepoint_name' => 'Day',",
" 'display_median' => 'calculate_median',",
paste0(" 'timepoint_array' => array(", paste(tsOrder, collapse = ","),"),"),
" 'datasets' => array("
)
if (length(grep("dataseries_order", names(dfDesign))) > 0){
if (length(grep("ts_color", names(dfDesign))) > 0){
dfO <- unique(dfDesign[,c("dataseries", "dataseries_order","ts_color")])
dfO <- dfO[order(dfO$dataseries_order, decreasing = F),]
dataseriesVec <- as.vector(dfO$dataseries)
dataseriesColVec <- as.vector(dfO$ts_color)
} else {
dfO <- unique(dfDesign[,c("dataseries", "dataseries_order")])
dfO <- dfO[order(dfO$dataseries_order, decreasing = F),]
dataseriesVec <- as.vector(dfO$dataseries)
dataseriesColVec <- rainbow(length(dataseriesVec))
}
} else {
if (length(grep("ts_color", names(dfDesign))) > 0){
dfO <- unique(dfDesign[,c("dataseries", "ts_color")])
dfO <- dfO[order(dfO$dataseries, decreasing = F),]
dataseriesVec <- as.vector(dfO$dataseries)
dataseriesColVec <- as.vector(dfO$ts_color)
} else {
dataseriesVec <- sort(unique(dfDesign$dataseries))
dataseriesColVec <- rainbow(length(dataseriesVec))
}
}
for (i in 1:length(dataseriesVec)){
dfTemp <- unique(
dfDesign[dfDesign$dataseries == dataseriesVec[i], c("sample.id", "dataseries", "sample.group", "timepoint")]
)
dfTemp <- dfTemp[order(dfTemp$timepoint, decreasing = F),]
timepointVec <- unique(dfTemp$timepoint)
sampleGroupVec <- unique(dfTemp$sample.group)
scriptVec <- c(
scriptVec,
paste0("'",dataseriesVec[i],"' => array("),
paste0(" 'color' => '",dataseriesColVec[i],"',"),
paste0(" 'sample_group' => array(")
)
for (j in 1:length(sampleGroupVec)){
dfTemp3 <- unique(dfDesign[dfDesign$sample.group %in% sampleGroupVec, c(timepointName, "sample.group")])
dfTemp3 <- dfTemp3[order(dfTemp3[,timepointName], decreasing = F),]
timepointVec <- as.numeric(dfTemp3[,timepointName])
dfTemp2 <- unique(dfTemp[dfTemp$sample.group == sampleGroupVec[j],])
scriptVec <- c(
scriptVec,
paste0("'",sampleGroupVec[j],"' => array("),
paste0(" 'timepoint' => ",timepointVec[j],","),
paste0(" 'sampleDbCols' => array("),
paste0(
sampleCols <- paste0("'norm_counts_", sort(dfTemp2$sample.id), "_TPM'"),
collapse = ","
),
")),"
)
}
scriptVec[length(scriptVec)] <- gsub(")),", ")))),",scriptVec[length(scriptVec)])
}
scriptVec[length(scriptVec)] <- gsub(",", ")),",scriptVec[length(scriptVec)])
scriptVec <- c(
scriptVec,
"// New End: Timecourse"
)
return(scriptVec)
}
## Create timecourse string ##
###############################################################################
if (sample.order[1] == "" | is.na(sample.order[1])){
sample.order <- sort(names(database.table)[grep("norm_counts_", names(database.table))])
}
if (count.sample.colors[1] == "" | is.na(count.sample.colors[1])){
count.sample.colors <- rainbow(length(sample.order))
}
if (sample.names[1] == "" | is.na(sample.names[1])){
sample.names <- gsub("norm_counts_", "", sample.order)
sample.names <- gsub("_", " ", sample.names)
}
settingsPhpVec <- c(
"<?php",
"",
"return array("
)
if (publicDataset){
settingsPhpVec <- c(
settingsPhpVec,
" 'public_access' => TRUE,"
)
}
settingsPhpVec <- c(
settingsPhpVec,
" 'lab' => array(",
paste0(" 'name' => '",obj@parameterList$labname," DB'"),
" ),",
"",
" /*",
" * Experiment settings",
" */",
paste0(" 'data_db_name' => '",obj@dbDetailList$primDataDB,"',"),
" 'data_db' => array(",
paste0(" 'cat_table_name' => '",obj@parameterList$cat.ref.db.table,"'"),
" ),",
"",
paste0(" 'rnaseq_db_table' => '",obj@parameterList$rnaseqdbTableName,"',"),
paste0(" 'primary_gene_symbol' => '",obj@parameterList$geneIDcolumn,"',"),
paste0(" 'ptm_display_column' => '",obj@parameterList$displayPTMcolumn,"',"),
"",
" 'heatmap' => array(",
paste0(" 'upper_limit' => ",upper_heatmap_limit,","),
paste0(" 'lower_limit' => ",lower_heatmap_limit,","),
paste0(" 'headline' => '",obj@parameterList$heamap.headline.text,"',"),
paste0(" 'pointer' => '",pointer,"'"),
" ),",
""
)
## Add sample array ##
settingsPhpVec <- c(
settingsPhpVec,
" 'samples' => array("
)
for (i in 1:length(sample.order)){
settingsPhpVec <- c(
settingsPhpVec,
paste0(" '",sample.order[i],"' => array("),
paste0(" 'color' => '",sample.colors[i],"',"),
paste0(" 'name' => '",sample.names[i],"'"),
" )"
)
if (i < length(sample.order)){
settingsPhpVec[length(settingsPhpVec)] <- paste0(
settingsPhpVec[length(settingsPhpVec)], ","
)
}
}
settingsPhpVec <- c(
settingsPhpVec,
" ), // End samples array"
)
## Done adding samples ##
## Adding barchart parameters ##
settingsPhpVec <- c(
settingsPhpVec,
" // bar chart",
" 'count_table' => array(",
paste0(" 'headline' => '", obj@parameterList$count.table.headline,"',"),
paste0(" 'sidelabel' => '", obj@parameterList$count.table.sidelabel,"'"),
" ),"
)
## Done adding barchart parameters ##
## Adding timecourse parameters ##
if (!is.null(timepointName)){
tempVec <- createTSparams(
dfDesign = Obio@dfDesign,
timepointName = timepointName
)
settingsPhpVec <- c(
settingsPhpVec,
tempVec
)
}
## Done adding timecourse parameters ##
## Adding Venn section ##
if (heatmapSampleOrder[1] == ""){
heatmapSampleOrder <- names(df.data)[grep("lg2_avg", names(df.data))]
}
heatMapString <- paste(heatmapSampleOrder, collapse = "','")
heatMapString <- paste0("'", heatMapString,"'")
settingsPhpVec <- c(
settingsPhpVec,
" // Venn Diagram Parameters",
" 'venn' => array(",
paste0(" 'experiments' => array(", heatMapString,"),"),
"",
" 'table' => array(",
" 'col_name_start' => 11,",
" 'low_highlight' => -1,",
" 'high_highlight' => 1",
" ),",
"",
" 'selection' => array("
)
vennCols <- as.vector(NULL, mode = "character")
## Make sure all venn cols are numeric ##
df.data[,vennCols] <- apply(df.data[,vennCols], 2, as.numeric)
for (i in 1:length(venn.slider.selector.strings)){
vennCols <- c(
vennCols,
names(df.data)[grep(venn.slider.selector.strings[i], names(df.data))]
)
}
for (i in 1:length(vennCols)){
colMax <- ceiling(max(as.numeric(df.data[,vennCols[i]]), na.rm = TRUE))
colMin <- floor(min(as.numeric(df.data[,vennCols[i]]), na.rm = TRUE))
Vname <- vennCols[i]
Vname <- substr(Vname ,11,100)
Vname <- gsub("^_", "", Vname)
Vname <- gsub("_", " ", Vname)
if (is.numeric(colMax) & is.numeric(colMin)){
settingsPhpVec <- c(
settingsPhpVec,
paste0(" '",vennCols[i],"' => array("),
paste0(" 'name' => '",Vname,"',"),
paste0(" 'slider_min' => ", colMin,","),
paste0(" 'slider_max' => ", colMax,","),
paste0(" 'default_low' => ", colMin,","),
paste0(" 'default_high' => ", colMax,""),
" )"
)
}
if (i < length(vennCols)){
settingsPhpVec[length(settingsPhpVec)] <- paste0(
settingsPhpVec[length(settingsPhpVec)], ","
)
}
}
settingsPhpVec <- c(
settingsPhpVec,
" )", ## Done with venn array
" )," ## Done with venn array
)
## Done adding Venn section
## Adding scatterplot ##
scatterCols <- as.vector(NULL, mode = "character")
for (i in 1:length(plot.selection.strings)){
scatterCols <- c(
scatterCols,
names(df.data)[grep(plot.selection.strings[i], names(df.data))]
)
}
if (!is.null(pcaDbTable)){
settingsPhpVec <- c(
settingsPhpVec,
" // Scatterplot Parameters",
paste0("'pca' => '", pcaDbTable, "',")
)
}
if (length(scatterCols) > 0){
if (is.null(defaultXcolName)){
defaultXcolName <- scatterCols[1]
}
if (is.null(defaultYcolName)){
defaultXcolName <- scatterCols[2]
}
settingsPhpVec <- c(
settingsPhpVec,
" // Scatterplot Parameters",
" 'scatterplot' => array(",
paste0(" 'default-x' => '",defaultXcolName,"',"),
paste0(" 'default-y' => '",defaultYcolName,"',"),
" 'selection' => array("
)
for (i in 1:length(scatterCols)){
Sname <- scatterCols[i]
Sname <- substr(Sname ,11,100)
Sname <- gsub("^_", "", Sname)
Sname <- gsub("_", " ", Sname)
settingsPhpVec <- c(
settingsPhpVec,
paste0(" '",scatterCols[i],"' => array("),
paste0(" 'name' => '",Sname,"'"),
" )"
)
if (i < length(scatterCols)){
settingsPhpVec[length(settingsPhpVec)] <- paste0(
settingsPhpVec[length(settingsPhpVec)], ","
)
}
}
settingsPhpVec <- c(
settingsPhpVec,
" )", # close scatterplot selection array
" )", # close scatterplot array
"//End scatterplot" # close scatterplot array
)
}
## Done adding scatterplot ##
## End of file ##
settingsPhpVec <- c(
settingsPhpVec,
");"
)
###########################################################################
## Create settings.php file ##
setwd(webSiteDir)
if (!dir.exists(project_id)){
dir.create(project_id)
}
if (substr(webSiteDir, nchar(webSiteDir), nchar(webSiteDir)) != "/"){
webSiteDir <- paste0(
webSiteDir,
"/"
)
}
FN <- paste0(
webSiteDir,
project_id,
"/settings.php"
)
sink(FN)
for (i in 1:length(settingsPhpVec)){
cat(settingsPhpVec[i]); cat("\n")
}
sink()
## Done creating settings.php file ##
###########################################################################
}
## End: createSettingsFile() ##
###############################################################################
##############################################################################
## (2C) createSettingsJSON() ##
#' @title createSettingsJSON
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#' @import jsonlite
#'
#'
createSettingsJSON <- function(
obj = "object",
df.data = 'database.table',
publicDataset = FALSE,
defaultXcolName = NULL,
defaultYcolName = NULL,
timepointName = NULL,
sample.order = "names(database.table)[grep('norm_counts', names(databse.table))]", #set to "" to go with default sorting
heatmapSampleOrder = "lg2_avg vec",
sample.names = "", # default is sample.order
count.sample.colors = 'rainbow(length(sample.order))',
ptm.colum = "display_ptm",
count.table.headline = "PTM ratio H/L counts for all samples",
count.table.sidelabel = "Counts",
venn.slider.selector.strings = 'c("contrast_x_logFC", "constrast_x_padj")',
plot.selection.strings = 'c(
"_logFC",
"_PCA_",
"_lg10p"
)',
webSiteDir = "/camp/stp/babs/working/boeings/Stefan/protocol_files/github/biologic/src/experiments",
upper_heatmap_limit = 3,
lower_heatmap_limit = -3,
heamap.headline.text = "heamap.headline.text",
project_id = "project_id",
primDataTable = "p123_rna_seq_table",
pcaDbTable = NULL,
pointer = "Gene Symbol:"
){
###############################################################################
## Create timecourse string from dfDesign ##
createTSparams <- function(
dfDesign = Obio@dfDesign,
timepointName = "Timepoint"
) {
tsOrder <- as.numeric(sort(unique(dfDesign$timepoint)))
scriptVec <- as.vector(NULL, mode = "character")
tsList <- list()
tsList[["timecourse_chart"]] <- list(
"timepoint_name" = "Day",
"display_median" = "calculate_median",
"timepoint_array" = list(
paste(tsOrder, collapse = ",")
)
)
# scriptVec <- c(
# scriptVec,
# "// New Begin: Timecourse",
# "'timecourse_chart' => array(",
# " 'timepoint_name' => 'Day',",
# " 'display_median' => 'calculate_median',",
# paste0(" 'timepoint_array' => array(", paste(tsOrder, collapse = ","),"),"),
# " 'datasets' => array("
# )
datasetsList <- list()
if (length(grep("dataseries_order", names(dfDesign))) > 0){
if (length(grep("ts_color", names(dfDesign))) > 0){
dfO <- unique(dfDesign[,c("dataseries", "dataseries_order","ts_color")])
dfO <- dfO[order(dfO$dataseries_order, decreasing = F),]
dataseriesVec <- as.vector(dfO$dataseries)
dataseriesColVec <- as.vector(dfO$ts_color)
} else {
dfO <- unique(dfDesign[,c("dataseries", "dataseries_order")])
dfO <- dfO[order(dfO$dataseries_order, decreasing = F),]
dataseriesVec <- as.vector(dfO$dataseries)
dataseriesColVec <- rainbow(length(dataseriesVec))
}
} else {
if (length(grep("ts_color", names(dfDesign))) > 0){
dfO <- unique(dfDesign[,c("dataseries", "ts_color")])
dfO <- dfO[order(dfO$dataseries, decreasing = F),]
dataseriesVec <- as.vector(dfO$dataseries)
dataseriesColVec <- as.vector(dfO$ts_color)
} else {
dataseriesVec <- sort(unique(dfDesign$dataseries))
dataseriesColVec <- rainbow(length(dataseriesVec))
}
}
for (i in 1:length(dataseriesVec)){
dfTemp <- unique(
dfDesign[dfDesign$dataseries == dataseriesVec[i], c("sample.id", "dataseries", "sample.group", "timepoint")]
)
dfTemp <- dfTemp[order(dfTemp$timepoint, decreasing = F),]
timepointVec <- unique(dfTemp$timepoint)
sampleGroupVec <- unique(dfTemp$sample.group)
datasetsList[[dataseriesVec[i]]] <- list(
'color' = dataseriesColVec[i]
)
# scriptVec <- c(
# scriptVec,
# paste0("'",dataseriesVec[i],"' => array("),
# paste0(" 'color' => '",dataseriesColVec[i],"',"),
# paste0(" 'sample_group' => array(")
# )
sample_groupList <- list()
for (j in 1:length(sampleGroupVec)){
dfTemp3 <- unique(dfDesign[dfDesign$sample.group %in% sampleGroupVec, c(timepointName, "sample.group")])
dfTemp3 <- dfTemp3[order(dfTemp3[,timepointName], decreasing = F),]
timepointVec <- as.numeric(dfTemp3[,timepointName])
dfTemp2 <- unique(dfTemp[dfTemp$sample.group == sampleGroupVec[j],])
sample_groupList[[sampleGroupVec[j]]] = list(
"timepoint" = timepointVec[j],
"sampleDbCols" = list(
paste0(
sampleCols <- paste0("'norm_counts_", sort(dfTemp2$sample.id), "_TPM'"),
collapse = ","
)
)
)
} ## End for loop
datasetsList[[dataseriesVec[i]]] [["sample_group"]] <- sample_groupList
#scriptVec[length(scriptVec)] <- gsub(")),", ")))),",scriptVec[length(scriptVec)])
}
#scriptVec[length(scriptVec)] <- gsub(",", ")),",scriptVec[length(scriptVec)])
tsList[["datasets"]] = datasetsList
return(tsList)
}
## ##
###############################################################################
jsonList <- list()
###############################################################################
## Public access ##
if (publicDataset){
jsonList[["public_access"]] <- TRUE
} else {
jsonList[["public_access"]] <- FALSE
}
## ##
###############################################################################
###############################################################################
## Base parameters ##
if (sample.order[1] == "" | is.na(sample.order[1])){
sample.order <- sort(names(database.table)[grep("norm_counts_", names(database.table))])
}
if (count.sample.colors[1] == "" | is.na(count.sample.colors[1])){
count.sample.colors <- rainbow(length(sample.order))
}
if (sample.names[1] == "" | is.na(sample.names[1])){
sample.names <- gsub("norm_counts_", "", sample.order)
sample.names <- gsub("_", " ", sample.names)
}
jsonList[["lab"]] <- list(
"name" = obj@parameterList$labname,
"data_db_name" = obj@dbDetailList$primDataDB,
"data_db" = list(
"cat_table_name" = obj@parameterList$cat.ref.db.table
),
"rnaseq_db_table" = obj@parameterList$rnaseqdbTableName,
"primary_gene_symbol" = obj@parameterList$geneIDcolumn,
"ptm_display_column" = obj@parameterList$displayPTMcolumn,
"heatmap" = list(
"upper_limit" = upper_heatmap_limit,
"lower_limit" = lower_heatmap_limit,
"headline" = obj@parameterList$heamap.headline.text,
"pointer" = pointer
)
)
###############################################################################
## Create sampe list ##
samplesList <- list()
for (i in 1:length(sample.order)){
samplesList[[sample.order[i]]] <- list(
"color" = sample.colors[i],
"name" = sample.names[i]
)
}
jsonList[["samples"]] <- samplesList
## ##
###############################################################################
###############################################################################
## Add barchart parameters ##
jsonList[["count_table"]] <- list(
"headline" = obj@parameterList$count.table.headline,
"sidelabel" = obj@parameterList$count.table.sidelabel
)
## Done ##
###############################################################################
###############################################################################
## Timecourse parameters ##
if (!is.null(timepointName)){
tsList <- createTSparams(
dfDesign = Obio@dfDesign,
timepointName = timepointName
)
jsonList[["timecourse_chart"]] <- tsList
}
## Done ##
###############################################################################
###############################################################################
## Venn parameters ##
if (heatmapSampleOrder[1] == ""){
heatmapSampleOrder <- names(df.data)[grep("lg2_avg", names(df.data))]
}
heatMapString <- paste(heatmapSampleOrder, collapse = '\\",\\"')
#heatMapString <- paste0("'", heatMapString,"'")
vennList <- list(
"experiments" = heatmapSampleOrder,
"table" = list(
"col_name_start" = 11,
"low_highlight" = -1,
"high_highlight" = 1
)
)
## Make selectionList
selectionList <- list()
vennCols <- as.vector(NULL, mode = "character")
## Make sure all venn cols are numeric ##
df.data[,vennCols] <- apply(df.data[,vennCols], 2, as.numeric)
for (i in 1:length(venn.slider.selector.strings)){
vennCols <- c(
vennCols,
names(df.data)[grep(venn.slider.selector.strings[i], names(df.data))]
)
}
for (i in 1:length(vennCols)){
colMax <- ceiling(max(as.numeric(df.data[,vennCols[i]]), na.rm = TRUE))
colMin <- floor(min(as.numeric(df.data[,vennCols[i]]), na.rm = TRUE))
Vname <- vennCols[i]
Vname <- substr(Vname ,11,100)
Vname <- gsub("^_", "", Vname)
Vname <- gsub("_", " ", Vname)
if (is.numeric(colMax) & is.numeric(colMin)){
selectionList[[vennCols[i]]] <- list(
"name" = Vname,
"slider_min" = colMin,
"slider_max" = colMax,
"default_low" = colMin,
"default_high" = colMax
)
} # end if
}
vennList[["selection"]] <- selectionList
jsonList[["venn"]] <- vennList
## Done ##
###############################################################################
###############################################################################
## Add PCA ##
if (!is.null(pcaDbTable)){
jsonList[["pca"]] <- pcaDbTable
}
## Done ##
###############################################################################
###############################################################################
## Add scatterplot ##
scatterCols <- as.vector(NULL, mode = "character")
for (i in 1:length(plot.selection.strings)){
scatterCols <- c(
scatterCols,
names(df.data)[grep(plot.selection.strings[i], names(df.data))]
)
}
if (length(scatterCols) > 0){
scatterplotList <- list()
if (is.null(defaultXcolName)){
defaultXcolName <- scatterCols[1]
}
if (is.null(defaultYcolName)){
defaultXcolName <- scatterCols[2]
}
scatterplotList[["default-x"]] <- defaultXcolName
scatterplotList[["default-y"]] <- defaultYcolName
selectionList <- list()
for (i in 1:length(scatterCols)){
Sname <- scatterCols[i]
Sname <- substr(Sname ,11,100)
Sname <- gsub("^_", "", Sname)
Sname <- gsub("_", " ", Sname)
selectionList[[scatterCols[i]]] <- list(
"name" = Sname
)
}
scatterplotList[["selection"]] <- selectionList
jsonList[["scatterplot"]] <- scatterplotList
}
## Done ##
###############################################################################
json <- jsonlite::toJSON(jsonList,pretty=TRUE,auto_unbox=TRUE)
return(json)
}
## End: createSettingsFile() ##
###############################################################################
###############################################################################
## ##
###############################################################################
##
#' @title createSeuratSettingsFile
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createSeuratSettingsFile <- function(
#obj = "object",
labname = "labname",
prim.data.db = "primDataDB",
cat.ref.db.table = "cat.ref.db.table",
rnaseqdbTableName = "rnaseqdbTableName",
geneIDcolumn = "geneIDcolumn",
displayPTMcolumn = "",
heatmap.headline.text = "Heatmap",
count.table.headline = "",
count.table.sidelabel = "",
df.data = 'database.table',
defaultXcolName = NULL,
defaultYcolName = NULL,
sample.order = "names(database.table)[grep('norm_counts', names(databse.table))]", #set to "" to go with default sorting
heatmapSampleOrder = "lg2_avg vec",
sample.names = "", # default is sample.order
count.sample.colors = 'rainbow(length(sample.order))',
ptm.colum = "display_ptm",
venn.slider.selector.strings = 'c("contrast_x_logFC", "constrast_x_padj")',
plot.selection.strings = 'c(
"_logFC",
"_PCA_",
"_lg10p"
)',
webSiteDir = "/camp/stp/babs/working/boeings/Stefan/protocol_files/github/biologic/src/experiments",
upper_heatmap_limit = 3,
lower_heatmap_limit = -3,
heamap.headline.text = "heamap.headline.text",
project_id = "project_id",
primDataTable = "p123_rna_seq_table",
pcaDbTable = NULL,
pointer = "Gene Symbol:"
){
if (sample.order[1] == "" | is.na(sample.order[1])){
sample.order <- sort(names(database.table)[grep("norm_counts_", names(database.table))])
}
if (count.sample.colors[1] == "" | is.na(count.sample.colors[1])){
count.sample.colors <- rainbow(length(sample.order))
}
if (sample.names[1] == "" | is.na(sample.names[1])){
sample.names <- gsub("norm_counts_", "", sample.order)
sample.names <- gsub("_", " ", sample.names)
}
settingsPhpVec <- c(
"<?php",
"",
"return array(",
" 'lab' => array(",
paste0(" 'name' => '",labname," DB'"),
" ),",
"",
" /*",
" * Experiment settings",
" */",
paste0(" 'data_db_name' => '",primDataDB,"',"),
" 'data_db' => array(",
paste0(" 'cat_table_name' => '",cat.ref.db.table,"'"),
" ),",
"",
paste0(" 'rnaseq_db_table' => '",rnaseqdbTableName,"',"),
paste0(" 'primary_gene_symbol' => '",geneIDcolumn,"',"),
paste0(" 'ptm_display_column' => '",displayPTMcolumn,"',"),
"",
" 'heatmap' => array(",
paste0(" 'upper_limit' => ",upper_heatmap_limit,","),
paste0(" 'lower_limit' => ",lower_heatmap_limit,","),
paste0(" 'headline' => '",heamap.headline.text,"',"),
paste0(" 'pointer' => '",pointer,"'"),
" ),",
""
)
## Add sample array ##
settingsPhpVec <- c(
settingsPhpVec,
" 'samples' => array("
)
for (i in 1:length(sample.order)){
settingsPhpVec <- c(
settingsPhpVec,
paste0(" '",sample.order[i],"' => array("),
paste0(" 'color' => '",sample.colors[i],"',"),
paste0(" 'name' => '",sample.names[i],"'"),
" )"
)
if (i < length(sample.order)){
settingsPhpVec[length(settingsPhpVec)] <- paste0(
settingsPhpVec[length(settingsPhpVec)], ","
)
}
}
settingsPhpVec <- c(
settingsPhpVec,
" ), // End samples array"
)
## Done adding samples ##
## Adding barchart parameters ##
settingsPhpVec <- c(
settingsPhpVec,
" // bar chart",
" 'count_table' => array(",
paste0(" 'headline' => '", count.table.headline,"',"),
paste0(" 'sidelabel' => '", count.table.sidelabel,"'"),
" ),"
)
## Done adding barchart parameters ##
## Adding Venn section ##
if (heatmapSampleOrder[1] == ""){
heatmapSampleOrder <- names(df.data)[grep("lg2_avg", names(df.data))]
}
heatMapString <- paste(heatmapSampleOrder, collapse = "','")
heatMapString <- paste0("'", heatMapString,"'")
settingsPhpVec <- c(
settingsPhpVec,
" // Venn Diagram Parameters",
" 'venn' => array(",
paste0(" 'experiments' => array(", heatMapString,"),"),
"",
" 'table' => array(",
" 'col_name_start' => 11,",
" 'low_highlight' => -1,",
" 'high_highlight' => 1",
" ),",
"",
" 'selection' => array("
)
vennCols <- as.vector(NULL, mode = "character")
## Make sure all venn cols are numeric ##
df.data[,vennCols] <- apply(df.data[,vennCols], 2, as.numeric)
for (i in 1:length(venn.slider.selector.strings)){
vennCols <- c(
vennCols,
names(df.data)[grep(venn.slider.selector.strings[i], names(df.data))]
)
}
for (i in 1:length(vennCols)){
colMax <- ceiling(max(as.numeric(df.data[,vennCols[i]]), na.rm = TRUE))
colMin <- floor(min(as.numeric(df.data[,vennCols[i]]), na.rm = TRUE))
Vname <- vennCols[i]
Vname <- substr(Vname ,11,100)
Vname <- gsub("^_", "", Vname)
Vname <- gsub("_", " ", Vname)
if (is.numeric(colMax) & is.numeric(colMin)){
settingsPhpVec <- c(
settingsPhpVec,
paste0(" '",vennCols[i],"' => array("),
paste0(" 'name' => '",Vname,"',"),
paste0(" 'slider_min' => ", colMin,","),
paste0(" 'slider_max' => ", colMax,","),
paste0(" 'default_low' => ", colMin,","),
paste0(" 'default_high' => ", colMax,""),
" )"
)
}
if (i < length(vennCols)){
settingsPhpVec[length(settingsPhpVec)] <- paste0(
settingsPhpVec[length(settingsPhpVec)], ","
)
}
}
settingsPhpVec <- c(
settingsPhpVec,
" )", ## Done with venn array
" )," ## Done with venn array
)
## Done adding Venn section
## Adding scatterplot ##
scatterCols <- as.vector(NULL, mode = "character")
for (i in 1:length(plot.selection.strings)){
scatterCols <- c(
scatterCols,
names(df.data)[grep(plot.selection.strings[i], names(df.data))]
)
}
if (!is.null(pcaDbTable)){
settingsPhpVec <- c(
settingsPhpVec,
" // Scatterplot Parameters",
paste0("'pca' => '", pcaDbTable, "',")
)
}
if (length(scatterCols) > 0){
if (is.null(defaultXcolName)){
defaultXcolName <- scatterCols[1]
}
if (is.null(defaultYcolName)){
defaultXcolName <- scatterCols[2]
}
settingsPhpVec <- c(
settingsPhpVec,
" // Scatterplot Parameters",
" 'scatterplot' => array(",
paste0(" 'default-x' => '",defaultXcolName,"',"),
paste0(" 'default-y' => '",defaultYcolName,"',"),
" 'selection' => array("
)
for (i in 1:length(scatterCols)){
Sname <- scatterCols[i]
Sname <- substr(Sname ,11,100)
Sname <- gsub("^_", "", Sname)
Sname <- gsub("_", " ", Sname)
settingsPhpVec <- c(
settingsPhpVec,
paste0(" '",scatterCols[i],"' => array("),
paste0(" 'name' => '",Sname,"'"),
" )"
)
if (i < length(scatterCols)){
settingsPhpVec[length(settingsPhpVec)] <- paste0(
settingsPhpVec[length(settingsPhpVec)], ","
)
}
}
settingsPhpVec <- c(
settingsPhpVec,
" )", # close scatterplot selection array
" )", # close scatterplot array
"//End scatterplot" # close scatterplot array
)
}
## Done adding scatterplot ##
## End of file ##
settingsPhpVec <- c(
settingsPhpVec,
");"
)
###########################################################################
## Create settings.php file ##
setwd(webSiteDir)
if (!dir.exists(project_id)){
dir.create(project_id)
}
if (substr(webSiteDir, nchar(webSiteDir), nchar(webSiteDir)) != "/"){
webSiteDir <- paste0(
webSiteDir,
"/"
)
}
FN <- paste0(
webSiteDir,
project_id,
"/settings.php"
)
sink(FN)
for (i in 1:length(settingsPhpVec)){
cat(settingsPhpVec[i]); cat("\n")
}
sink()
## Done creating settings.php file ##
###########################################################################
}
##
###############################################################################
## ##
###############################################################################
###############################################################################
## (20) Function add2labCatSelectionDBtable() ##
## ##
#' @title add2labCatSelectionDBtable
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
add2labCatSelectionDBtable <- function (
df.ref = "reference table",
cat_group_name = "Tybulewicz lab",
reference.gene.vector = "reference gene vector",
ref.gene.vec.id = "hgnc_symbol",
cat_views = NA
){
sel.vec = c(
"cat_name",
"cat_id",
"hgnc_symbol",
"mgi_symbol",
"cat_item_size",
"comments_1"
)
df.ref <- unique(df.ref[,sel.vec])
df.ref[["cat_group"]] = cat_group_name
df.ref[["cat_count"]] = 0
df.ref[["cat_weight"]] = 0
df.ref[["cat_views"]] = 0
for (i in 1:nrow(df.ref)){
gene.vector <- df.ref[i, ref.gene.vec.id]
gene.vector <- unlist(strsplit(gene.vector, ";"))
## Remove numeric values if present e.g. gene_name(1.0)
gene.vector <- sapply(gene.vector, function(x) unlist(strsplit(x, "\\("))[1])
gene.vector <- as.vector(na.omit(gene.vector))
gene.vector <- gene.vector[gene.vector != ""]
a = sum(relevant.genes %in% gene.vector)
b = sum(toupper(relevant.genes) %in% gene.vector)
if (a >= b){
df.ref[i, "cat_count"] = a
} else {
df.ref[i, "cat_count"] = b
}
}
if (i%%100 == 0){
print(cat(i))
}
df.ref[df.ref$cat_item_size > 0, "cat_weight"] <- round(
df.ref[df.ref$cat_item_size > 0, "cat_count"]/
df.ref[df.ref$cat_item_size > 0, "cat_item_size"],
2
)
df.ref$hgnc_symbol <- NULL
df.ref$mgi_symbol <- NULL
return(df.ref)
}
## End Function add2labCatSelectionDBtable() ##
###############################################################################
###############################################################################
## (2) Create.website.parameters ##
###############################################################################
#' @title create.website.parameters
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
create.website.parameters <- function(
df.data,
gene.id.column = "hgnc_symbol",
ptm.colum = "display_ptm",
lab_id = "st_lab",
user_ids = c("project", "st_lab_all", "thomas.mercer"),
project_id = "stl1",
download_result_table = "20160823.logFC.datatable.txt",
download_cat_enrichment_table = "p50.interesting.categories.txt",
database = "stl_data",
reference_categories_db = "reference_categories_db_new",
labname = "Tooze",
rnaseqdbTableName,
lab.categories.table = "st_lab_categories",
sample.order = names(df.data)[grepl("norm_counts_",names(df.data))],
#set to "" to go with default sorting
count.sample.colors = "",
count.table.headline = "PTM ratio H/L counts for all samples",
count.column.chart.x.axis.label = "hrs",
count.table.sidelabel = "Counts",
webSiteDir = "C:/xampp/htdocs/",
heamap.headline.text = "log2FC(SILAC) Heatmap",
upper_heatmap_limit = 3,
lower_heatmap_limit = -3,
slider.selection.name = "logFC",
presentation.file = "",
number_of_slides = "",
default.sequence = "_____ARK_______",
use.logFC.columns.for.heatmap = FALSE,
peptide.view.link = "",
create.2d.scatterplot.button =FALSE,
low_highlight = -1,
high_highlight =1,
display.qc = FALSE,
display.pca = FALSE,
display.report = FALSE,
pca.table.name = "",
gsea.cat.lines = NA,
timecourse.cat.lines = NA,
venn.slider.selector.strings = c("contrast_x_logFC", "constrast_x_padj"),
plot.selection.strings = NA, #strings to grep from col names for plot display
plate.view.db.table = NA,
plate.view.column.vec = NA,
cat.seletion.table.vec = c(
"reference_categories_db_new",
"cat_selection_default",
),
localhost = "localhost",
createOutputFile = TRUE
){
#############################################################################
## Create login.ini.php ##
#############################################################################
setwd(webSiteDir)
dirExist = length(grep(paste0("^",project_id,"$"), list.files()))
if (dirExist == 0){
dir.create(paste(webSiteDir, project_id, sep=""))
}
## Create outputs ##
setwd(paste(webSiteDir, project_id, sep=""))
dirExist = length(grep("outputs", list.files()))
if (dirExist == 0){
dir.create(paste(webSiteDir, project_id, "/outputs", sep=""))
}
#############################################################################
## Create start.ini.php######### ##
#############################################################################
# sink(file = "start.ini.php")
# cat('<?php');cat("\n"); cat("\n");
# cat(paste0('$host ="',localhost,'";'));cat("\n");
# cat('$user = "logincheck";');cat("\n");
# cat('$pwd = "O2ktmuTHqx7V";');cat("\n");
# cat('$location = "search.result.php";');cat("\n");
# cat(paste('$lab_id = "', lab_id,'";', sep=""));cat("\n");
# cat(paste('$project_id = "', project_id,'";', sep=""));cat("\n");
# #Create user array
# user.array = '$user_id_array = array('
# for (i in 1:length(user_ids)){
# user.array = paste(user.array, '"',user_ids[i], '", ', sep="")
# }
#
# user.array = substr(user.array, 1, nchar(user.array)-2)
# user.array = paste(user.array, ');', sep="")
# cat(user.array);cat("\n"); cat("\n");
# cat('?>');cat("\n"); cat("\n");
# sink()
## Done with start ini ##
#############################################################################
#############################################################################
## Create output table ##
df.output <- df.data
df.output$row_names <- NULL
df.output$row_id <- NULL
df.output$for_GSEA_gene_chip <- NULL
df.output$entrezgene <- NULL
df.output$associated_gene_name <- NULL
df.output$cluster_order <- NULL
df.output$cluster_id <- NULL
names(df.output) <- gsub("contrast_1_", "", names(df.output))
names(df.output) <- gsub("contrast_2_", "", names(df.output))
names(df.output) <- gsub("contrast_3_", "", names(df.output))
names(df.output) <- gsub("contrast_4_", "", names(df.output))
names(df.output) <- gsub("contrast_5_", "", names(df.output))
names(df.output) <- gsub("contrast_6_", "", names(df.output))
names(df.output) <- gsub("contrast_7_", "", names(df.output))
names(df.output) <- gsub("contrast_8_", "", names(df.output))
names(df.output) <- gsub("contrast_9_", "", names(df.output))
names(df.output) <- gsub("contrast_x_", "", names(df.output))
names(df.output) <- gsub("contrast_X_", "", names(df.output))
pos <- grep("lg2_avg", names(df.output))
if (length(pos)> 0){
df.output <- df.output[,-pos]
}
df.output <- unique(df.output)
## Done creating output table ##
#############################################################################
## Check if outputs folder needs to be made ##
if (length(grep("outputs/", download_result_table)) > 0){
if (!dir.exists("outputs")){
dir.create("outputs")
}
}
if (createOutputFile){
write.table(
df.output, paste0(
download_result_table,
".txt"
),
row.names=FALSE,
sep="\t"
)
}
print(
paste0(
"Convert ",
paste0(download_result_table,
".txt"
),
" to .xlsx."
)
)
#############################################################################
## Create layout.ini.php ##
sink(file = "layout.ini.php")
cat('<?php');cat("\n"); cat("\n");
cat('######################################');cat("\n");
cat('#Basic Parameters #');cat("\n");
cat('######################################');cat("\n");
cat('######################################');cat("\n");
cat('##Set pwd ##');cat("\n");
cat('if (file_exists("distii/dist.txt")){');cat("\n");
cat(' $fh = fopen("distii/dist.txt", "r");');cat("\n");
cat(' $pwd = fgets($fh);');cat("\n");
cat(' fclose($fh);');cat("\n");
cat(paste0( '$host ="',localhost,'";'));cat("\n");
cat('} else if (file_exists("../../util/babs_db.php")){');cat("\n");
cat(" require_once('../../util/babs_db.php');");cat("\n");
cat(paste0(' $host ="clvd0-db-u-t-08.thecrick.test";'));cat("\n");
cat('} else {');cat("\n");
cat(' echo "Error: Database password could not be established.";');cat("\n");
cat('}');cat("\n");
cat('######################################');cat("\n");
cat('$user = "biologic_website";');cat("\n");
cat(paste('$database = "',database,'";', sep=''));cat("\n");
cat(paste('$lab_id = "', lab_id,'";', sep=""));cat("\n"); cat("\n");
cat(paste('$project_id = "', project_id,'";', sep=""));cat("\n"); cat("\n");
#Create user array
##
## Get user array from db ##
user.array = ' $user_id_array = array('
for (i in 1:length(user_ids)){
user.array = paste(user.array, '"',user_ids[i], '", ', sep="")
}
user.array = substr(user.array, 1, nchar(user.array)-2)
user.array = paste(user.array, ');', sep="")
cat("#if (!isset($_SESSION['userArray'])){");cat("\n");
cat('# $userSQLquery = "SELECT DISTINCT experiment_viewers FROM project_db_table WHERE experiment_id = :project_name";');cat("\n");
cat('# $userArray = query_db(');cat("\n");
cat('# $sql_query = $userSQLquery,');cat("\n");
cat('# $host= $host,');cat("\n");
cat('# $user= $user,');cat("\n");
cat('# $pwd= $pwd,');cat("\n");
cat('# $dbname="reference_categories_db_new",');cat("\n");
cat("# $bindParam_name_array = array(':project_name'),");cat("\n");
cat('# $bindParam_value_array = array($project_id)');cat("\n");
cat('# );');cat("\n");
cat("#\n");
cat('# if (!empty($userArray)){');cat("\n");
cat('# $row = $userArray[0];');cat("\n");
cat('# extract($row);');cat("\n");
cat('# $experiment_viewers = trim($experiment_viewers, ";");');cat("\n");
cat('# $user_id_array = explode(";", $experiment_viewers);');cat("\n");
cat('# } else {');cat("\n");
cat(user.array);cat("\n"); cat("\n");
cat('# }');cat("\n");
cat('#}');cat("\n");cat("\n");cat("\n");
##
cat(paste('$labname="',labname, '";',sep=''));cat("\n");
cat(paste('$primary_gene_symbol = "', gene.id.column, '";', sep="")); cat("\n");
if (gene.id.column == "hgnc_symbol"){
cat(paste('$ensembl_id = "ENSG";', sep="")); cat("\n");
} else {
cat(paste('$ensembl_id = "ENSMUSG";', sep="")); cat("\n");
}
cat(paste('$ptm_display_column = "', ptm.colum, '";', sep="")); cat("\n");
if (display.report){
cat('#########################################');cat("\n");
cat('## Report ##');cat("\n");
cat('#########################################');cat("\n");
cat('$display_report = TRUE;');cat("\n");
cat('$reportDB = "internal_categories";');cat("\n");
cat('$reportTable = "experiment_reports";');cat("\n");
}
cat('#########################################');cat("\n");
cat('#Search.result.php #');cat("\n");
cat('#########################################');cat("\n");
cat(paste('$count_column_chart_x_axis_label = "', count.column.chart.x.axis.label, '";', sep="")); cat("\n");
cat(paste('$count_column_chart_y_axis_label = "', count.table.sidelabel, '";', sep="")); cat("\n");
cat('######################################');cat("\n");
cat('#Table settings #');cat("\n");
cat('######################################');cat("\n");
cat('$narrow = 299;');cat("\n");
cat('$wide = 600;');cat("\n");
all.genes = sort(unique(df.data[,gene.id.column]))
all.genes = all.genes[all.genes != ""]
all.genes = paste(all.genes, collapse = "','")
all.genes = paste("'", all.genes, "'", sep="")
all.genes = paste('$all_genes = "[', all.genes,']";', sep="")
cat(all.genes);cat("\n");
cat(paste0('$low_highlight=' , low_highlight, ';'));cat("\n");
cat(paste0('$high_highlight=' , high_highlight, ';'));cat("\n");
cat('$wide = 600;');cat("\n");
cat('######################################');cat("\n");
cat('#QC #');cat("\n");
cat('######################################');cat("\n");
if (display.qc){
cat('$display_qc = TRUE;');cat("\n");
} else {
cat('$display_qc = FALSE;');cat("\n");
}
cat('$wide = 600;');cat("\n");
cat('######################################');cat("\n");
cat('#PCA #');cat("\n");
cat('######################################');cat("\n");
if (display.pca){
cat('$display_pca = TRUE;');cat("\n");
cat(paste0('$pca_db_table ="',pca.table.name,'";'));cat("\n");
} else {
cat('$display_pca = FALSE;');cat("\n");
}
cat('$wide = 600;');cat("\n");
cat('######################################');cat("\n");
cat('#Other Variables #');cat("\n");
cat('######################################');cat("\n");
cat('$peptide_view_link = "";');cat("\n");
cat('$display_plate_view = "";');cat("\n");
cat('$display_ptm = "";');cat("\n");
cat('######################################');cat("\n");
cat('#Report #');cat("\n");
cat('######################################');cat("\n");
if (display.report){
cat('$display_report = TRUE;');cat("\n");
} else {
cat('$display_report = FALSE;');cat("\n");cat("\n");
}
cat('######################################');cat("\n");
cat('#Data tables #');cat("\n");
cat('######################################');cat("\n");
cat(paste('$cat_database = "',reference_categories_db,'";', sep=''));cat("\n");
cat(paste('$ref_database = "',reference_categories_db,'";', sep=''));cat("\n");
db = paste('$rnaseq_db_table = "',rnaseqdbTableName, '";', sep='')
cat(db); cat("\n");
cat(paste('$lab_category_table="',lab.categories.table,'";', sep=""));cat("\n");
cat(paste('$download_result_table = "',download_result_table,'";', sep=''));cat("\n");
cat(paste('$download_cat_enrichment_table = "',download_cat_enrichment_table,'";', sep=''));cat("\n");
if (gene.id.column == "mgi_symbol"){
cat('$taxon_id="10090";'); cat("\n");
} else if (gene.id.column == "Dmel_symbol"){
cat('$taxon_id="7227";'); cat("\n");
}
else {
cat('$taxon_id="9606";'); cat("\n");
}
cat('########################################');cat("\n");
cat('#2D plot array #');cat("\n");
cat('########################################');cat("\n");
if (is.na(plot.selection.strings[1])){
#Create plot selection string
if (ptm.colum == ""){
string = names(df.data)[grep("contrast", names(df.data))]
} else {
string = append(names(df.data[grep("logFC", names(df.data))]), names(df.data[grep("int", names(df.data))]))
}
string <- string[string != "logFC_cut_off"]
## If present, make contrast_1_logFC and contrast_1_lg10p first entries ##
posLogFC <- grep("contrast_1_logFC", string)
posLg10p <- grep("contrast_1_lg10p", string)
if ((length(posLogFC) > 0) & (length(posLg10p) > 0)){
string <- c(
string[posLogFC],
string[posLg10p],
string
)
}
string = paste(string, collapse = "','")
string = paste("$plot_selection = array('", string, "');", sep="")
cat(string);cat("\n");
} else {
string <- vector(mode="character", length=0)
for (i in 1:length(plot.selection.strings)){
string = append(string, names(df.data)[grep(plot.selection.strings[i], names(df.data))])
}
string <- string[string != "logFC_cut_off"]
## If present, make contrast_1_logFC and contrast_1_lg10p first entries ##
posLogFC <- grep("contrast_1_logFC", string)
posLg10p <- grep("contrast_1_lg10p", string)
if ((length(posLogFC) > 0) & (length(posLg10p) > 0)){
string <- c(
string[posLogFC],
string[posLg10p],
string
)
}
string = paste(string, collapse = "','")
string = paste("$plot_selection = array('", string, "');", sep="")
cat(string);cat("\n");
}
#Create 2D-scatterplot button
if (create.2d.scatterplot.button){
cat("$create_2d_scatterplot_button=TRUE;");cat("\n");
}
if (!is.na(plate.view.db.table) & (!is.na(plate.view.column.vec[1]))){
cat('########################################');cat("\n");
cat('#Plate View Optons #');cat("\n");
cat('########################################');cat("\n");
cat(paste0("$display_plate_view = TRUE ;"));cat("\n");
cat(paste0("$plate_view_db_table ='",plate.view.db.table, "';"));cat("\n");
pv.string <- paste(plate.view.column.vec, collapse = "','")
pv.string <- paste0("$plate_view_selection = array('", pv.string, "');")
cat(pv.string);cat("\n");
}
#Create table.display string
#The setup below will create a table displaying logFC changes only in the data table
cat('########################################');cat("\n");
cat('#Table display array #');cat("\n");
cat('########################################');cat("\n");
string = names(df.data)[grep("contrast", names(df.data))]
string1 = string[grep("logFC", string)]
#string2 = string[grep("padj", string)]
## Remove LRT padj ##
#string = append(string1, string2)
string <- string1
string = paste(string, collapse = "','")
string = paste("$table_display_columns = array('", string, "');", sep="")
cat(string);cat("\n");
cat('$col_name_start = 11;');cat("\n");
cat('########################################');cat("\n");
cat('#Sample order array #');cat("\n");
cat('########################################');cat("\n");
#Make sample array string
if (sample.order[1] == ""){
sample.order = names(df.data[grep("counts", names(df.data))])
}
sample.string = paste(sample.order, collapse = "', '")
sample.string = paste("$sample_array = array('", sample.string, "');", sep="")
cat(sample.string);cat("\n");
#In this case, I need the follownig pattern A, B, B, B,
cat('########################################');cat("\n");
cat('#Sample colors #');cat("\n");
cat('########################################');cat("\n");
library(RColorBrewer)
#Make sample_color_string Give each sample group a different color Needs to be of the same length and in the same order as
# sample.string
n.samples = length(sample.order)
if (count.sample.colors[1] == "" | length(count.sample.colors) != length(sample.order)){
selcol <- colorRampPalette(brewer.pal(12,"Set3"))
selcol2 <- colorRampPalette(brewer.pal(9,"Set1"))
cols = selcol2(n.samples);
} else {
cols = count.sample.colors
}
col.string = "$sample_color_array = array('"
for (i in 1:length(cols)){
col.string = paste(col.string,cols[i],"','",sep="")
}
col.string = substr(col.string, 1, nchar(col.string)-2)
col.string = paste(col.string, ");", sep="")
cat(col.string);cat("\n");
cat('########################################');cat("\n");
cat('#Venn Selection #');cat("\n");
cat('########################################');cat("\n");
## Edited 20170314
string <- vector(mode="character", length=0)
for (i in 1:length(venn.slider.selector.strings)){
string = append(string, names(df.data)[grep(venn.slider.selector.strings[i], names(df.data))])
}
string = paste(string, collapse = "','")
string = paste("$venn_selection = array('", string, "');", sep="")
cat(string);cat("\n");
#Venn ranges
cat(paste('$slider_selection_name = "',slider.selection.name,'";', sep=""));cat("\n");
#This array defines the overall slider range
string <- vector(mode="character", length=0)
for (i in 1:length(venn.slider.selector.strings)){
string = append(string, names(df.data)[grep(venn.slider.selector.strings[i], names(df.data))])
}
slider.min = '$venn_slider_min_array = array('
for (i in 1:length(string)){
if (length(grep("padj", string[i])) > 0){
slider.min = paste(
slider.min,
0,
", ",
sep=""
)
} else {
slider.min = paste(slider.min,
round(min(as.numeric(df.data[,string[i]]), na.rm=T)-1),
", ",
sep=""
)
}
}
slider.min = substr(slider.min, 1, nchar(slider.min)-2)
slider.min = paste(slider.min, ");", sep="")
cat(slider.min);cat("\n")
slider.max = '$venn_slider_max_array = array('
for (i in 1:length(string)){
if (length(grep("padj", string[i])) > 0){
slider.max = paste(slider.max,
1,
", ",
sep=""
)
} else {
slider.max = paste(slider.max,
round(max(as.numeric(df.data[,string[i]]), na.rm=T)+1),
", ",
sep=""
)
}
}
slider.max = substr(slider.max, 1, nchar(slider.max)-2)
slider.max= paste(slider.max, ");", sep="")
cat(slider.max);cat("\n")
#This array defines the default positions of the sliders
slider.min = '$default_venn_low_array = array('
for (i in 1:length(string)){
if (length(grep("padj", string[i])) > 0){
slider.min = paste(slider.min,
0,
", ",
sep=""
)
} else {
slider.min = paste(slider.min,
round(min(as.numeric(df.data[,string[i]]), na.rm=T)-1),
", ",
sep=""
)
}
}
slider.min = substr(slider.min, 1, nchar(slider.min)-2)
slider.min = paste(slider.min, ");", sep="")
cat(slider.min);cat("\n")
slider.max = '$default_venn_high_array = array('
for (i in 1:length(string)){
if (length(grep("padj", string[i])) > 0){
if (i ==1){
sliderValue <- 0.05
} else {
sliderValue <- 1
}
slider.max = paste(slider.max,
sliderValue,
", ",
sep=""
)
} else {
slider.max = paste(slider.max,
round(max(as.numeric(df.data[,string[i]]), na.rm=T)+1),
", ",
sep=""
)
}
}
slider.max = substr(slider.max, 1, nchar(slider.max)-2)
slider.max= paste(slider.max, ");", sep="")
cat(slider.max);cat("\n")
#Make experiment array
#This array is used for heatmap display
if (use.logFC.columns.for.heatmap){
experiment.string = names(df.data)[grep("logFC_", names(df.data))]
experiment.string = experiment.string[grep("contrast", experiment.string)]
} else {
experiment.string = names(df.data)[grep("lg2_avg_", names(df.data))]
}
experiment.string = paste(experiment.string, collapse = "','")
experiment.string = paste("$experiment_array = array('", experiment.string, "');", sep="")
cat(experiment.string);cat("\n");
#This sink will be removed and joined with the GSEA parameters in the final version
#1 Create environment
#jsl1 folder
#dist and distii folders
###########################
#Create display parameters#
###########################
cat("##################"); cat("\n");
cat("#Heatmap settings#"); cat("\n");
cat("##################"); cat("\n");
cat(paste('$upper_heatmap_limit = ',upper_heatmap_limit,';', sep="")); cat("\n");
cat(paste('$lower_heatmap_limit = ',lower_heatmap_limit, ';',sep="")); cat("\n");
cat(paste('$hm_headline = "<h2>',heamap.headline.text,'</h2><br>";', sep="")); cat("\n");
cat('$heatmap_pointer = "lg2(SILAC Ratio)";');
cat("\n");
cat("##################"); cat("\n");
cat("#Count table #"); cat("\n");
cat("##################"); cat("\n");
cat(paste('$count_table_headline = "',count.table.headline,'";', sep="")); cat("\n");
cat(paste('$count_table_sidelabel = "',count.table.sidelabel,'";', sep="")); cat("\n");
# GSEA
if (!is.na(gsea.cat.lines[1])){
for (k in 1:length(gsea.cat.lines)){
cat(gsea.cat.lines[k]); cat("\n");
}
} else {
cat("#################"); cat("\n");
cat("#GSEA Parameters#"); cat("\n");
cat("#################"); cat("\n");
cat("# Not set."); cat("\n");
cat("\n");
}
#enriched_categories_table = paste('$enriched_categories_table="',enriched.categories.db.table.name,'";', sep='')
#cat(enriched_categories_table); cat("\n");
#cat(paste('$image_link_array = "',image.link.array,'";', sep="")); cat("\n");
#enriched.cols = paste('$enriched_padj_cols = "',enriched.padj.cols,'";', sep='')
#cat(enriched.cols); cat("\n");
# timecourse
if (!is.na(timecourse.cat.lines[1])){
for (k in 1:length(timecourse.cat.lines)){
cat(timecourse.cat.lines[k]); cat("\n");
}
} else {
cat("########################"); cat("\n");
cat("# Timecourse Parameters#"); cat("\n");
cat("########################"); cat("\n");
cat("# Not set."); cat("\n");
cat("\n");
}
if (ptm.colum != ""){
cat("#################"); cat("\n");
cat("#Motif array #"); cat("\n");
cat("#################"); cat("\n");
string = "$motif_default_array = array('"
for (i in 1:nchar(default.sequence)){
string = paste(string, substr(default.sequence,i,i), "','", sep="")
}
string = substr(string, 1, nchar(string)-2)
string = paste(string, ");", sep="")
cat(string); cat("\n");
}
if (presentation.file != ""){
cat("##################"); cat("\n");
cat("#Presentation #"); cat("\n");
cat("##################"); cat("\n");
cat(paste("$download_presentation ='", presentation.file, "';", sep="")); cat("\n");
cat(paste("$number_of_slides =", number_of_slides, ";", sep="")); cat("\n");
}
if (peptide.view.link != ""){
cat("##########################"); cat("\n");
cat("#PeptdideView section #"); cat("\n");
cat("##########################"); cat("\n");
cat(paste("$peptide_view_link ='", peptide.view.link, "';", sep="")); cat("\n");
}
cat("\n");
cat("###############################################################################"); cat("\n");
cat("## CategoryView parameters ##"); cat("\n");
cat("###############################################################################"); cat("\n");
cat(paste0("$cat_selection_db = '",cat.seletion.table.vec[1], "';")); cat("\n");
cat(paste0("$cat_selection_db_table = '",cat.seletion.table.vec[2], "';")); cat("\n");
cat("\n");
cat("?>"); cat("\n");
sink()
}
## End of function ##
###############################################################################
###############################################################################
## (2B) createSettingsFile() ##
# createSettingsFile <- function(
# df.data = 'database.table',
# sample.order = "names(database.table)[grep('norm_counts', names(databse.table))]", #set to "" to go with default sorting
# heatmapSampleOrder = "lg2_avg vec",
# sample.names = "", # default is sample.order
# count.sample.colors = 'rainbow(length(sample.order))',
# ptm.colum = "display_ptm",
# count.table.headline = "PTM ratio H/L counts for all samples",
# count.table.sidelabel = "Counts",
# venn.slider.selector.strings = 'c("contrast_x_logFC", "constrast_x_padj")',
# plot.selection.strings = 'c(
# "_logFC",
# "_PCA_",
# "_lg10p"
# )',
# webSiteDir = "/camp/stp/babs/working/boeings/Stefan/protocol_files/github/biologic/src/experiments",
# upper_heatmap_limit = 3,
# lower_heatmap_limit = -3,
# heamap.headline.text = "heamap.headline.text",
# project_id = "project_id",
# primDataTable = "p123_rna_seq_table"
# ){
# if (sample.order[1] == "" | is.na(sample.order[1])){
# sample.order <- sort(names(database.table)[grep("norm_counts_", names(database.table))])
# }
#
# if (count.sample.colors[1] == "" | is.na(count.sample.colors[1])){
# count.sample.colors <- rainbow(length(sample.order))
# }
#
# if (sample.names[1] == "" | is.na(sample.names[1])){
# sample.names <- gsub("norm_counts_", "", sample.order)
# sample.names <- gsub("_", " ", sample.names)
# }
#
# settingsPhpVec <- c(
# "<?php",
# "",
# "return array(",
# " 'lab' => array(",
# paste0(" 'name' => '",labname," DB'"),
# " ),",
# "",
# " /*",
# " * Experiment settings",
# " */",
# paste0(" 'data_db_name' => '",prim.data.db,"',"),
# " 'data_db' => array(",
# paste0(" 'cat_table_name' => '",cat.ref.db.table,"'"),
# " ),",
# "",
# paste0(" 'rnaseq_db_table' => '",primDataTable,"',"),
# paste0(" 'primary_gene_symbol' => '",gene.id.column,"',"),
# paste0(" 'ptm_display_column' => '",ptm.colum,"',"),
# "",
# " 'heatmap' => array(",
# paste0(" 'upper_limit' => ",upper_heatmap_limit,","),
# paste0(" 'lower_limit' => ",lower_heatmap_limit,","),
# paste0(" 'headline' => '",heamap.headline.text,"',"),
# " 'pointer' => 'lg2(SILAC Ratio)'",
# " ),",
# ""
# )
#
# ## Add sample array ##
# settingsPhpVec <- c(
# settingsPhpVec,
# " 'samples' => array("
# )
#
# for (i in 1:length(sample.order)){
# settingsPhpVec <- c(
# settingsPhpVec,
# paste0(" '",sample.order[i],"' => array("),
# paste0(" 'color' => '",sample.colors[i],"',"),
# paste0(" 'name' => '",sample.names[i],"'"),
# " )"
# )
# if (i < length(sample.order)){
# settingsPhpVec[length(settingsPhpVec)] <- paste0(
# settingsPhpVec[length(settingsPhpVec)], ","
# )
# }
# }
# settingsPhpVec <- c(
# settingsPhpVec,
# " ), // End samples array"
# )
#
# ## Done adding samples ##
#
# ## Adding barchart parameters ##
# settingsPhpVec <- c(
# settingsPhpVec,
# " // bar chart",
# " 'count_table' => array(",
# paste0(" 'headline' => '", count.table.headline,"',"),
# paste0(" 'sidelabel' => '", count.table.sidelabel,"'"),
# " ),"
# )
# ## Done adding barchart parameters ##
#
# ## Adding Venn section ##
# if (heatmapSampleOrder[1] == ""){
# heatmapSampleOrder <- names(df.data)[grep("lg2_avg", names(df.data))]
# }
#
# heatMapString <- paste(heatmapSampleOrder, collapse = "','")
# heatMapString <- paste0("'", heatMapString,"'")
#
# settingsPhpVec <- c(
# settingsPhpVec,
# " // Venn Diagram Parameters",
# " 'venn' => array(",
# paste0(" 'experiments' => array(", heatMapString,"),"),
# "",
# " 'table' => array(",
# " 'col_name_start' => 11,",
# " 'low_highlight' => -1,",
# " 'high_highlight' => 1",
# " ),",
# "",
# " 'selection' => array("
# )
#
# vennCols <- as.vector(NULL, mode = "character")
# for (i in 1:length(venn.slider.selector.strings)){
# vennCols <- c(
# vennCols,
# names(df.data)[grep(venn.slider.selector.strings[i], names(df.data))]
# )
# }
#
# for (i in 1:length(vennCols)){
# colMax <- ceiling(max(df.data[,vennCols[i]], na.rm = TRUE))
# colMin <- floor(min(df.data[,vennCols[i]], na.rm = TRUE))
#
# Vname <- vennCols[i]
# Vname <- substr(Vname ,11,100)
# Vname <- gsub("^_", "", Vname)
# Vname <- gsub("_", " ", Vname)
#
# if (is.numeric(colMax) & is.numeric(colMin)){
# settingsPhpVec <- c(
# settingsPhpVec,
# paste0(" '",vennCols[i],"' => array("),
# paste0(" 'name' => '",Vname,"',"),
# paste0(" 'slider_min' => ", colMin,","),
# paste0(" 'slider_max' => ", colMax,","),
# paste0(" 'default_low' => ", colMin,","),
# paste0(" 'default_high' => ", colMax,""),
# " )"
# )
# }
#
# if (i < length(vennCols)){
# settingsPhpVec[length(settingsPhpVec)] <- paste0(
# settingsPhpVec[length(settingsPhpVec)], ","
# )
# }
#
#
# }
#
# settingsPhpVec <- c(
# settingsPhpVec,
# " )", ## Done with venn array
# " )," ## Done with venn array
# )
# ## Done adding Venn section
#
# ## Adding scatterplot ##
# scatterCols <- as.vector(NULL, mode = "character")
# for (i in 1:length(plot.selection.strings)){
# scatterCols <- c(
# scatterCols,
# names(df.data)[grep(plot.selection.strings[i], names(df.data))]
# )
# }
#
# settingsPhpVec <- c(
# settingsPhpVec,
# " // Scatterplot Parameters",
# " 'scatterplot' => array(",
# " 'selection' => array("
# )
#
# for (i in 1:length(scatterCols)){
# Sname <- scatterCols[i]
# Sname <- substr(Sname ,11,100)
# Sname <- gsub("^_", "", Sname)
# Sname <- gsub("_", " ", Sname)
#
# settingsPhpVec <- c(
# settingsPhpVec,
# paste0(" '",scatterCols[i],"' => array("),
# paste0(" 'name' => '",Sname,"'"),
# " )"
# )
#
# if (i < length(scatterCols)){
# settingsPhpVec[length(settingsPhpVec)] <- paste0(
# settingsPhpVec[length(settingsPhpVec)], ","
# )
# }
#
#
# }
#
# settingsPhpVec <- c(
# settingsPhpVec,
# " )", # close scatterplot selection array
# " )", # close scatterplot array
# "//End scatterplot" # close scatterplot array
# )
#
# ## Done adding scatterplot ##
#
# ## End of file ##
# settingsPhpVec <- c(
# settingsPhpVec,
# ");"
# )
#
# ###########################################################################
# ## Create settings.php file ##
# setwd(webSiteDir)
# if (!dir.exists(project_id)){
# dir.create(project_id)
# }
#
#
# if (substr(webSiteDir, nchar(webSiteDir), nchar(webSiteDir)) != "/"){
# webSiteDir <- paste0(
# webSiteDir,
# "/"
# )
# }
#
# FN <- paste0(
# webSiteDir,
# project_id,
# "/settings.php"
# )
#
# sink(FN)
# for (i in 1:length(settingsPhpVec)){
# cat(settingsPhpVec[i]); cat("\n")
# }
# sink()
#
# ## Done creating settings.php file ##
# ###########################################################################
# }
#
# ## End: createSettingsFile() ##
# ###############################################################################
###############################################################################
## (47) listExistingProjects() ##
#' @title listExistingProject
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
listExistingProjects <- function(
user = "boeings",
password = "",
host = "www.biologic-db.org",
dbname = "reference_categories_db_new",
dbtable = "project_description_table"
){
library(RMySQL)
dbDB <- dbConnect(MySQL(), user = user, password = password, host = host, dbname=dbname)
ResVec = sort(as.vector(dbGetQuery(dbDB, paste0("SELECT DISTINCT project_name FROM ", dbtable))[,1]))
dbDisconnect(dbDB)
return(ResVec)
}
## End of function (47) ##
###############################################################################
###############################################################################
## (44) createNewProject ##
#' @title createNewProject
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createNewProject <- function(
dbname = "reference_categories_db_new",
dbtable = "project_description_table",
password = db.pwd,
project_name = "G2 Cell Cycle Checkpoint",
project_lab = ";Parker;",
project_description = "This project investigates the influence of protein kinase C variants on the G2 cell cycle checkpoint."
){
# Retrieve list of existing projects
dfProjectTable <- import.db.table.from.db(
dbname = dbname,
dbtable = dbtable,
password = password
)
dfProjectTable$row_names <- NULL
dfProjectTable <- unique(dfProjectTable)
## Make safety copy ##
FN <- paste0(
hpc.mount,
"Projects/tybulewiczv/edina.schweighoffer/102_VTL_DEV_build_projectView/basedata/",
project.code,
"backupForProjectDescriptionTable.txt"
)
write.table(
dfProjectTable,
FN,
row.names = FALSE,
sep = "\t"
)
## Add new row ##
cols <- names(dfProjectTable)
NewRow <- data.frame(t(unlist(sapply(cols, function(x) get(x)))))
if (sum(!(names(NewRow) %in% names(dfProjectTable))) == 0){
dfProjectTable <- rbind(
dfProjectTable,
NewRow
)
}
upload.datatable.to.database(
host = host,
user = db.user,
password = db.pwd,
prim.data.db = dbname,
dbTableName = dbtable,
df.data = dfProjectTable,
db.col.parameter.list = list(
"VARCHAR(50) CHARACTER SET utf8 COLLATE utf8_general_ci" = c("project_lab"),
"VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_general_ci" = c("project_name"),
"TEXT CHARACTER SET utf8 COLLATE utf8_general_ci" = c("project_description"),
"BIGINT(8) NULL DEFAULT NULL" = c("row_names")
),
new.table = TRUE
)
}
## Done adding new project ##
###############################################################################
###############################################################################
## (45) List project in projects table ##
#' @title addProject2ProjectTable
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
addProject2ProjectTable <- function(
dbname = "reference_categories_db_new",
dbtable = "project_db_table",
password = db.pwd,
experiment_id= project_id,
experiment_question = "Which genes are candidates for G2M checkpoint regulation?",
experiment_description = "<b>RNAi screen</b> for genes that affect the G2M checkpoint",
experiment_owner = ";Katharina Deiss;Nicola Longwood;",
experiment_lab = paste0(";",labname,";"),
experiment_viewers = paste0(";", paste(user_ids, collapse = ";"),";"),
experiment_project = ";G2 Cell Cycle Checkpoint;",
experiment_type = experiment.type,
experiment_details = "Experimental details will follow here",
experiment_x_coordinate = ";Screen;",
experiment_x_coordinate_unit = "condition",
experiment_link = paste0(
"<a href='../",
project_id,
"/report.php' class='btn btn-success btn-lg' role='button'>Primary Data »</a>"
),
experiment_title = ""
){
# Retrieve list of existing projects
dfProjectTable <- import.db.table.from.db(
dbname = "reference_categories_db_new",
dbtable = "project_db_table",
password = password
)
dfProjectTable <- dfProjectTable[dfProjectTable$experiment_id != "",]
dfProjectTable$row_names <- NULL
## Make safety copy ##
FN <- paste0(
hpc.mount,
"Projects/tybulewiczv/edina.schweighoffer/102_VTL_DEV_build_projectView/basedata/",
project.code,
"backupForProjectTable.txt"
)
write.table(
dfProjectTable,
FN,
row.names = FALSE,
sep = "\t"
)
## Add new row ##
cols <- names(dfProjectTable)
NewRow <- data.frame(t(unlist(sapply(cols, function(x) get(x)))))
if (sum(!(names(NewRow) %in% names(dfProjectTable))) == 0){
dfProjectTable <- rbind(
dfProjectTable,
NewRow
)
}
upload.datatable.to.database(
host = host,
user = db.user,
password = db.pwd,
prim.data.db = dbname,
dbTableName = dbtable,
df.data = dfProjectTable,
db.col.parameter.list = list(
"VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_general_ci" = c("experiment_"),
"TEXT CHARACTER SET utf8 COLLATE utf8_general_ci" = c("experiment_description", "experiment_details"),
"BIGINT(8) NULL DEFAULT NULL" = c("row_names")
),
new.table = TRUE
)
}
## Done listing project ##
###############################################################################
###############################################################################
## Function: (32) createSRAdownloadScript() ##
#' @title createSRAdownloadScript
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createSRAdownloadScript <- function(
sra.id.vector = "sra.id.vector",
gse.id.vector = "",
datadir = "",
module.load.cmd = "module use /camp/stp/babs/working/software/modules/all;module load sratoolkit/2.8.2-1",
fastqDir = "./"
){
sink(paste0(sra.id.vector, ".sra.download.instructions.sh"))
cat("#!/bin/sh"); cat('\n');
cat("## Change into download directory ##"); cat('\n');
cat(paste0("cd ", fastqDir)); cat("\n");
cat(module.load.cmd); cat('\n');
## Download by project ##
wget.cmd <- paste0(
"wget -m ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/",
substr(sra.id.vector, 1, 6),
"/",
sra.id.vector,
"/")
cat("## Download SRA library ##"); cat('\n');
cat(wget.cmd); cat('\n');
mv.cmd <- paste0(
"mv ",
fastqDir,
"ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByStudy/sra/SRP/",
substr(sra.id.vector, 1, 6),
"/",
sra.id.vector,
"/* ./"
)
rm.cmd <- paste0(
"rm -r ftp-trace.ncbi.nlm.nih.gov"
)
cat("## Organize files ##"); cat('\n');
cat(mv.cmd); cat('\n');
cat(rm.cmd); cat('\n');
## Make list of folders ##
# create an array with all the filer/dir inside ~/myDir
arr.cmd <- paste0(
"arr=(",
"./",
"*)"
)
cat("## Create sample array ##"); cat('\n');
cat(arr.cmd); cat('\n');
## fastq-dump command ##
fastq.dump.cmd <- paste0(
'sbatch --wrap "',
'fastq-dump --outdir ',
fastqDir,
' --gzip --skip-technical --readids --read-filter pass --dumpbase --split-files --origfmt $directory.sra',
'" --job-name=',
project.code ,
' -c 1 --mem-per-cpu=1000 -o ',
'fastqdump.slurm >> commands.txt'
)
# iterate through array using a counter
## convert to FASTQ ##
cat('for ((i=0; i<${#arr[@]}; i++)); do'); cat('\n');
cat(' #do something to each element of array'); cat('\n');
cat(' echo "Processing ${arr[$i]} ..."'); cat('\n');
cat(' directory=${arr[$i]}'); cat('\n');
cat(' mv $directory/* ./'); cat('\n');
cat(' rm -r $directory'); cat('\n');
cat(' ## fastq-dump cmd'); cat('\n');
cat(module.load.cmd); cat('\n');
cat(fastq.dump.cmd); cat('\n');
cat(' ## Estimating read length ##'); cat('\n');
cat('filename=$directory__pass_1.fastq.gz'); cat('\n');
cat('if [ -f "$filename" ];'); cat('\n');
cat('then'); cat('\n');
cat("zcat $filename | awk 'NR%2==0' | awk '{print length($1)}' | head"); cat('\n');
cat(' fi'); cat('\n');
cat('filename=$directory__pass_1.fastq.gz'); cat('\n');
cat('if [ -f "$filename" ];'); cat('\n');
cat('then'); cat('\n');
cat("zcat $filename | awk 'NR%2==0' | awk '{print length($1)}' | head"); cat('\n');
cat('fi'); cat('\n');
cat('done'); cat('\n');
cat('## Output files will be named like SRR2979627_pass_1.fastq.gz'); cat('\n');
if (gse.id.vector != ""){
cat('Get GSE annotation'); cat('\n');
cat(
paste0(
"wget https://ftp.ncbi.nlm.nih.gov/geo/series/",
substr(gse.id.vector, 1, 5),
"nnn/",
substr(gse.id.vector, 1, 10),
"/matrix/",
gse.id.vector,
"_series_matrix.txt.gz -O ",
datadir
)
); cat('\n');
cat(
paste0(
"gunzip ",
datadir,
gse.id.vector,
"_series_matrix.txt.gz"
)
); cat('\n');
}
sink()
## create documentation
sra.docu.vec <- c(
paste0("SRA project id:", sra.id.vector),
paste0("Download date/time:" , date())
)
print("If this shell script was generated on a windows computer run:")
print(paste0("tr -d '\r' <",paste0(sra.id.vector, ".sra.download.instructions.sh"),"> conv.",paste0(sra.id.vector, ".sra.download.instructions.sh")))
return(sra.docu.vec)
}
## For ERA downloads use: For example, the files submitted in the SRA Submission ERA007448 are available at: ftp://ftp.sra.ebi.ac.uk/vol1/ERA007/ERA007448/
## End of function: createSRAdownloadScript() ##
###############################################################################
###############################################################################
## Function: (32b) createSRRdownloadScript() ##
#' @title createSRRdownloadScript
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
createSRRdownloadScript <- function(
srr.id.vector = "srr.id.vector",
sra.id.vector = "sra.id.vector",
gse.id.vector = "",
datadir = "",
module.load.cmd = "module use /camp/stp/babs/working/software/modules/all;module load sratoolkit/2.8.2-1",
fastqDir = "./",
project.code = "project.code"
){
scriptVec <- as.vector(NULL, mode = "character")
scriptVec <- c(
scriptVec,
"#!/bin/sh",
"\n",
"## Change into download directory ##",
module.load.cmd,
"\n",
paste0("vdb-config --set /repository/user/default-path=", fastqDir),
"\n",
paste0("vdb-config --set /repository/user/main/public/root=", fastqDir),
"\n",
"\n"
)
# create an array with all the filer/dir inside ~/myDir
srrIDstoDownload <- paste(srr.id.vector, collapse = " ")
arr.cmd <- paste0(
"arr=(",srrIDstoDownload,")"
)
scriptVec <- c(
scriptVec,
"## Create sample array ##",
arr.cmd,
"\n"
)
## fastq-dump command ##
fastq.dump.cmd <- paste0(
'sbatch --time=12:00:00 --wrap "',
'fastq-dump --outdir ',
fastqDir,
' --gzip --skip-technical --readids --read-filter pass --dumpbase --split-files --origfmt $file',
'" --job-name=',
project.code ,
' -c 1 --mem-per-cpu=1000 -o ',
'fastqdump.slurm >> commands.txt'
)
# iterate through array using a counter
## convert to FASTQ ##
scriptVec <- c(
scriptVec,
'for ((i=0; i<${#arr[@]}; i++)); do',
' #do something to each element of array',
' echo "Processing ${arr[$i]} ..."',
' file=${arr[$i]}',
paste0('# mv $directory/* '), fastqDir ,
'# rm -r $file',
' ## fastq-dump cmd',
module.load.cmd,
fastq.dump.cmd,
' ## Estimating read length ##',
'filename=$directory__pass_1.fastq.gz',
'if [ -f "$filename" ];',
'then',
"zcat $filename | awk 'NR%2==0' | awk '{print length($1)}' | head",
'fi',
'done',
'## Output files will be named like SRR2979627_pass_1.fastq.gz'
)
if (gse.id.vector != ""){
scriptVec <- c(
scriptVec,
'Get GSE annotation',
paste0(
"wget https://ftp.ncbi.nlm.nih.gov/geo/series/",
substr(gse.id.vector, 1, 5),
"nnn/",
substr(gse.id.vector, 1, 10),
"/matrix/",
gse.id.vector,
"_series_matrix.txt.gz -O ",
datadir
),
paste0(
"gunzip ",
datadir,
gse.id.vector,
"_series_matrix.txt.gz"
)
)
}
scriptVec <- c(
scriptVec,
paste0("# SRA project id:", sra.id.vector),
paste0("# Download date/time:" , date())
)
## Create shell script
sink(paste0(sra.id.vector, ".srr.download.instructions.sh"))
for (i in 1:length(scriptVec)){
cat(scriptVec[i])
cat("\n")
}
sink()
## Done
print("If this shell script was generated on a windows computer run:")
print(paste0("tr -d '\r' <",paste0(sra.id.vector, ".srr.download.instructions.sh"),"> conv.",paste0(sra.id.vector, ".srr.download.instructions.sh")))
return(scriptVec)
}
## For ERA downloads use: For example, the files submitted in the SRA Submission ERA007448 are available at: ftp://ftp.sra.ebi.ac.uk/vol1/ERA007/ERA007448/
## End of function: createSRAdownloadScript() ##
###############################################################################
###############################################################################
# (9C) Create timecourse cat lines #
###############################################################################
#' @title create.timecourse.cat.lines
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
create.timecourse.cat.lines <- function(
df.design,
display.median.string = "calculate_median"
# alternative for display.median.string example norm_counts_sample_X_avg > "_avg" Don't forget the underscore.
){
# For the time series, the design file needs to contain the following colums
# To create the datasets vector: a >>dataseries<< column
# To assign data series colors >>dataseries_color<<
# To create the timepoint sample name vectors a >>sample.group<<
# To create the timepoint sample replicate name vectors a >>sample.id<<
# To create the timepoint array an >>timepoint<< column (integers/double (hours, minutes, days))
df.design.select <- unique(df.design[,c("dataseries","dataseries_color", "sample.group", "sample.id", "timepoint")])
df.design.select <- df.design.select[order(df.design.select$timepoint,
df.design.select$dataseries,
df.design.select$sample.id,
df.design.select$sample.group
),]
timecourse.cat.lines <- "#########################"
timecourse.cat.lines <- append(timecourse.cat.lines, "# Timecourse Parameters #")
timecourse.cat.lines <- append(timecourse.cat.lines, "#########################")
#Add Dataseries description
add <- paste(unique(df.design.select$dataseries), collapse = "', '")
add <- paste0("$datasets= array('", add, "');")
timecourse.cat.lines <- append(timecourse.cat.lines, add)
#Add dataset_colors
add <- paste(unique(df.design.select$dataseries_color), collapse = "', '")
add <- paste0("$dataset_colors = array('", add, "');")
timecourse.cat.lines <- append(timecourse.cat.lines, add)
# Create timepoint array
add <- paste(unique(df.design.select$timepoint), collapse = ",")
add <- paste0("$timepoint_array = array(", add, ");")
timecourse.cat.lines <- append(timecourse.cat.lines, add)
# Create sample arrays for each dataseries
dataseries <- unique(df.design.select$dataseries)
for (k in 1:length(dataseries)){
add <- paste0("# Dataseries: ", dataseries[k])
timecourse.cat.lines <- append(timecourse.cat.lines, add)
add <- paste0("$", dataseries[k], " = array('")
s.groups <- unique(df.design.select[df.design.select$dataseries == dataseries[k], "sample.group"])
add.groups <- paste(s.groups, collapse = "','")
add <- paste0(add, add.groups, "');")
timecourse.cat.lines <- append(timecourse.cat.lines, add)
# Add sample names (norm counts + sample.id)
for (l in 1:length(s.groups)){
string <- paste0("$", s.groups[l], " = array('")
s.ids <- unique(df.design.select[df.design.select$sample.group == s.groups[l], "sample.id"])
s.ids <- paste0("norm_counts_", s.ids)
s.ids <- paste(s.ids, collapse = "', '")
string <- paste0(string, s.ids, "');")
timecourse.cat.lines <- append(timecourse.cat.lines, string)
}
}
# Display median parameter
string <- paste0("$display_median = '", display.median.string,"';")
timecourse.cat.lines <- append(timecourse.cat.lines, string)
timecourse.cat.lines <- append(timecourse.cat.lines, "# End timecourse parameters #")
return(timecourse.cat.lines)
}
## End of function ##
###############################################################################
###############################################################################
## (18) Retrieve gene category from db ##
###############################################################################
#' @title retrieve.gene.category.from.db
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
retrieve.gene.category.from.db <- function(
cat_id = "mysigdb_c5_MF__127",
dbname = "reference_categories_db_new",
user = "boeings",
password = "",
host = "www.biologic-db.org",
gene.symbol = "mgi_symbol",
print.cat.name = TRUE
){
library(RMySQL)
table <- unlist(
strsplit(
cat_id, "__"
)
)[1]
## Query category name ##
drv = RMySQL::MySQL()
sql.query = paste0(
"SELECT cat_id, cat_name from ",
table,
" WHERE cat_id = '",
cat_id, "'"
)
dbDB <- dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname = dbname
)
cat.vec = dbGetQuery(
dbDB,
sql.query
)
dbDisconnect(dbDB)
if (print.cat.name){
print(
paste(
"Retrieved category: ",
paste0(
cat.vec$cat_name,
collapse = ", "
)
)
)
print(
paste(
"Retrieved category ID: ",
paste0(
cat.vec$cat_id,
collapse = ", "
)
)
)
}
## Query genes ##
sql.query = paste0(
"SELECT ",
gene.symbol,
" from ",
table,
" WHERE cat_id = '",
cat_id, "'"
)
dbDB <- dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname = dbname
)
cat.vec = dbGetQuery(
dbDB,
sql.query
)[,gene.symbol]
dbDisconnect(dbDB)
cat.vec <- unlist(
strsplit(
cat.vec,
";"
)
)
cat.vec <- cat.vec[!is.na(cat.vec)]
cat.vec <- cat.vec[cat.vec != ""]
cat.vec = as.vector(unique(cat.vec))
return(cat.vec)
}
## End of function ##
###############################################################################
######################################################
# (11) add.category.to.lab.reference.table.hs #
######################################################
###############################################################################
## Function update ##
#' @title add.category.to.lab.reference.table.hs
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
add.category.to.lab.reference.table.hs <- function(
host = 'www.biologic-db.org',
pwd = db.pwd,
user = "boeings",
cat.ref.db = "reference_categories_db_new",
cat.ref.db.table = "cs_lab_categories",
gene.vector = gene.vec,
gene.id = "hgnc_symbol", #options hgnc_symbol, mgi_symbol
mm.hs.conversion.file = "Y:/working/boeings/Projects/reference_data/20160303.homologene.data.txt",
cat_name = "lung_cancer_late_subclonal_driver_genes",
cat_type = "cs_lab",
data_source = "Swanton lab",
comments_1 = "",
comments_2 = "",
new.lab.category.table = FALSE,
cat.description.db = "internal_categories",
cat.description.db.table = "category_description",
cat.description.text = "Subclonal late driver genes, as compiled by the Swanton lab",
lab.name = "Swanton",
replaceExistingCatName = TRUE
) {
############################################################################
## Check if table exists and create it if not
## Done
############################################################################
## Preparind gene.vector ##
gene.vector <- na.omit(gene.vector)
gene.vector <- gene.vector[gene.vector != ""]
gene.vector <- as.vector(unique(gene.vector))
## Annotate genes ##
#########################
#Function################
#########################
make.mm.hs.conversion.table <- function(mm.hs.conversion.file){
cwd = getwd()
df.ncbi = read.delim(mm.hs.conversion.file, header=FALSE, sep="\t", stringsAsFactors = FALSE)
names(df.ncbi)[1] = "conversion_id"
names(df.ncbi)[2] = "taxon_id"
names(df.ncbi)[4] = "gene_name"
#Keep only mouse and human rows
df.hs = unique(df.ncbi[df.ncbi$taxon_id == 9606, c("conversion_id", "gene_name")])
names(df.hs)[2] = "hs_gene_name"
df.ms = unique(df.ncbi[df.ncbi$taxon_id == 10090,c("conversion_id", "gene_name")])
names(df.ms)[2] = "ms_gene_name"
df.conversion = unique(merge(df.hs, df.ms, by.x = "conversion_id", by.y = "conversion_id"))
df.conversion$conversion_id = NULL
setwd(cwd)
return (df.conversion)
}
#end of function
df.conversion = make.mm.hs.conversion.table(mm.hs.conversion.file)
names(df.conversion) = c("hgnc_symbol", "mgi_symbol")
assign(gene.id, gene.vector)
df.new <- data.frame(get(gene.id))
names(df.new) <- gene.id
df.conversion = df.conversion[df.conversion[,gene.id] %in% df.new[, gene.id], ]
df.new = merge(df.new, df.conversion, by.x = gene.id, by.y = gene.id, all=TRUE)
df.new[is.na(df.new)] = ""
df.new = unique(df.new[df.new[,gene.id] != "", ])
formatGeneVec <- function(gene.vec){
gene.vec <- na.omit(gene.vec)
gene.vec <- gene.vec[gene.vec != ""]
}
hgnc_symbol <- paste0(";",paste0(formatGeneVec(df.new$hgnc_symbol), collapse = ";"), ";")
mgi_symbol <- paste0(";",paste0(formatGeneVec(df.new$mgi_symbol), collapse = ";"), ";")
## Created dependent vectors ##
#cat_name = rep(cat_name, length(gene.vector))
#cat_type = rep(cat_type, length(gene.vector))
#data_source = rep(data_source, length(gene.vector))
#comments_1 = rep(comments_1, length(gene.vector))
#comments_2 = rep(comments_2, length(gene.vector))
cat_item_size = length(gene.vector)
############################################################################
## Determine if cat exists ##
query <- paste0(
"SELECT * FROM information_schema.tables WHERE table_schema = '", cat.ref.db, "' AND table_name = '", cat.ref.db.table,"' LIMIT 1;"
)
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = db.pwd,
host = host,
dbname = cat.ref.db
)
dfTest <- DBI::dbGetQuery(dbDB, query)
if (nrow(dfTest) == 0){
query <- paste0("CREATE TABLE ", cat.ref.db.table, " LIKE ag_lab_categories;")
}
res <- DBI::dbGetQuery(dbDB, query)
DBI::dbDisconnect(dbDB)
## Done ##
############################################################################
library(RMySQL)
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = db.pwd,
host = host,
dbname = cat.ref.db
)
query <- paste0(
"SELECT DISTINCT cat_id, cat_name FROM ",
cat.ref.db.table,
" WHERE cat_name = '",cat_name,"'"
)
dfTest <- DBI::dbGetQuery(dbDB, query)
DBI::dbDisconnect(dbDB)
updateCat <- FALSE
if (nrow(dfTest) == 1){
cat_id <- dfTest[,"cat_id"]
updateCat <- TRUE
} else {
dbDB = DBI::dbConnect(RMySQL::MySQL(), user = user, password = pwd, dbname= cat.ref.db ,host = host)
## Set default ##
next.id = 1
max.value = 0
max.value <- as.numeric(
DBI::dbGetQuery(dbDB,
paste(
"SELECT MAX(row_names) FROM ",
cat.ref.db.table, sep=""
)
)
)
if (!is.na(max.value)){
df.ref = DBI::dbGetQuery(dbDB, paste("SELECT DISTINCT cat_id FROM ", cat.ref.db.table, sep=""))
ids = unique(df.ref$cat_id)
ids = as.numeric(sapply(ids, function(x) unlist(strsplit(x, paste(cat.ref.db.table, "__", sep="")))[2]))
next.id = max(ids)+1
} else {
max.value = 0
}
#df.ref = dbGetQuery(dbDB, "SELECT DISTINCT * FROM js_lab_categories WHERE cat_id = 'not_existing'")
DBI::dbDisconnect(dbDB)
cat_id = paste(cat.ref.db.table, "__", next.id, sep="")
}
dbDB = DBI::dbConnect(RMySQL::MySQL(), user = user, password = pwd, dbname= cat.ref.db ,host = host)
df.ref = DBI::dbGetQuery(dbDB, "SELECT DISTINCT * FROM js_lab_categories WHERE cat_id = 'not_existing'")
DBI::dbDisconnect(dbDB)
## Get current reference category format from db ##
## Get default cat column names ##
## Current df.ref column names and order:
#[1] "hgnc_symbol" "mgi_symbol" "cat_id" "cat_name" "cat_type" "data_source" "comments_1"
#[8] "comments_2" "cat_item_size" "row_names"
add.internal.cat.description <- FALSE
if (comments_1[1] == ""){
comments_1 <- paste0("category.description.php?cat_id=", cat_id)
add.internal.cat.description <- TRUE
}
df.cat.new = data.frame(
hgnc_symbol,
mgi_symbol,
cat_id,
cat_name,
cat_type,
data_source,
comments_1,
comments_2,
cat_item_size,
stringsAsFactors = FALSE
)
#Consider only genes that are pesent in df.data
#Prepare data table
#df.ref$row_names = NULL
df.cat.new[["row_names"]] <- 0
if (!updateCat){
df.cat.new[["row_names"]] = max.value+1
}
df.cat.new = df.cat.new[, names(df.ref)]
#Upload to database
dbDB = DBI::dbConnect(RMySQL::MySQL(), user = user, password = pwd, dbname= cat.ref.db, host = host)
if (new.lab.category.table){
DBI::dbGetQuery(dbDB, paste("DROP TABLE IF EXISTS ", cat.ref.db.table, sep=""))
}
uploaded = FALSE
while (!uploaded){
tryCatch({
killDbConnections()
dbDB = DBI::dbConnect(MySQL(), user = user, password = pwd, dbname= cat.ref.db,host = host)
if (updateCat){
query <- paste0(
"UPDATE ", cat.ref.db.table,
" SET hgnc_symbol='",df.cat.new[,"hgnc_symbol"],
"', mgi_symbol='",df.cat.new[,"mgi_symbol"],
#"' cat_id='",df.cat.new[,"cat_id"],
"', cat_name='",df.cat.new[,"cat_name"],
"', cat_type='",df.cat.new[,"cat_type"],
"', data_source='",df.cat.new[,"data_source"],
"', comments_1='",df.cat.new[,"comments_1"],
"', comments_2='",df.cat.new[,"comments_2"],
"', cat_item_size='",df.cat.new[,"cat_item_size"],
"' WHERE cat_id = '", df.cat.new[,"cat_id"], "'"
)
DBI::dbGetQuery(dbDB, query)
} else {
query <- paste0(
"INSERT INTO ", cat.ref.db.table,
" (",
paste0(names(df.cat.new), collapse=", "),
") VALUES ('",
paste0(as.vector(t(df.cat.new[1,])), collapse="','"),
"')"
)
DBI::dbGetQuery(dbDB, query)
# dbWriteTable(
# dbDB,
# paste0(cat.ref.db,".",cat.ref.db.table),
# df.cat.new,
# row.names= FALSE,
# overwrite=FALSE,
# append=TRUE
# )
}
uploaded = TRUE
#dbDisconnect(dbDB)
}, error=function(e){cat("Upload errror :",conditionMessage(e), "\n")})
}
#dbWriteTable(dbDB, dataTable, df.new, row.names= FALSE, overwrite=FALSE, append=TRUE)
if (new.lab.category.table){
DBI::dbGetQuery(dbDB, paste("ALTER TABLE `",cat.ref.db.table,"` ADD UNIQUE(`row_names`)", sep=""))
DBI::dbGetQuery(dbDB, paste("ALTER TABLE `",cat.ref.db.table,"` ADD PRIMARY KEY(`row_names`)", sep=""))
DBI::dbGetQuery(dbDB, paste("ALTER TABLE ",cat.ref.db.table,"
CHANGE `hgnc_symbol` `hgnc_symbol` LONGTEXT CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `mgi_symbol` `mgi_symbol` LONGTEXT CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `cat_name` `cat_name` VARCHAR(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `cat_id` `cat_id` VARCHAR(100) CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `cat_type` `cat_type` VARCHAR(100) CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `data_source` `data_source` VARCHAR(100) CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `comments_1` `comments_1` VARCHAR(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `comments_2` `comments_2` VARCHAR(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci ,
CHANGE `cat_item_size` `cat_item_size` INT(5) NULL DEFAULT NULL,
CHANGE `row_names` `row_names` BIGINT(8) NULL DEFAULT NULL",
sep="")
)
}
DBI::dbDisconnect(dbDB)
# Add description
if (add.internal.cat.description){
## Escape ##
#cat.description.text <- gsub("\\'", "\\'", cat.description.text)
dbDB = DBI::dbConnect(RMySQL::MySQL(), user = user, password = pwd, dbname= cat.description.db,host = host)
insert.query <- paste0("INSERT INTO `",cat.description.db,"`.`",cat.description.db.table,"` (`cat_id`, `cat_name`, `cat_description`, `created_by`, `lab`, `creation_date`) VALUES ('",cat_id[1],"', '",cat_name[1],"', '",cat.description.text,"', '",data_source[1],"', '",lab.name,"', CURDATE())");
DBI::dbGetQuery(dbDB, insert.query)
DBI::dbDisconnect(dbDB)
}
string = paste0(cat_id, " with cat_name ",cat_name, " added.")
print(string)
return(cat_id)
}
## End of function ##
## Done function update ##
###############################################################################
###############################################################################
# (4) createRMDscript #
###############################################################################
#' @title createRMDscript
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="createRMDscript",
def=function(
obj #,
#scriptVecSlot = "scriptVec"
){
scriptVec <- as.vector(Obio@scriptVec)
sink("script.txt")
for (i in 1:length(scriptVec)){
cat(paste0("#", i,"# "));cat(scriptVec[i])
}
sink()
library(knitr)
#(s = system.file("examples", "knitr-spin.R", package = "knitr"))
spin("script.txt") # default markdown
#o = spin(s, knit = FALSE) # convert to Rmd only
#knit2html(o) # compile to HTML
}
)
###############################################################################
## (21) msigdb.gmt2refDB() ##
###############################################################################
#' @title msigdb.gmt2refDB
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
msigdb.gmt2refDB <- function(
df.gmt = "read.delim(gmt.file, header=FALSE, sep = '\t', stringsAsFactors = FALSE)",
host = "www.biologic-db.org",
db.user = "boeings",
pwd = "pwd",
ref.db = "reference_database",
ref.db.table = "reference database table name",
cat_type = "mysigdb_c5",
data_source = "Broad Institute",
keep.gene.values = FALSE,
gene.id = "hgnc_symbol",
create.new.table = FALSE,
mm.hs.conversion.file = "C:/Users/boeing01/Desktop/homologene.data"
){
## Load tab delimited gmt file ##
#setwd(dataDir)
## Load and generate file for ID conversion ##
#mm.hs.conversion.file = "Y:/working/boeings/Projects/reference_data/20160303.homologene.data.txt"
#########################
#Function################
#########################
make.mm.hs.conversion.table <- function(mm.hs.conversion.file){
cwd = getwd()
df.ncbi = read.delim(mm.hs.conversion.file, header=FALSE, sep="\t", stringsAsFactors = FALSE)
names(df.ncbi)[1] = "conversion_id"
names(df.ncbi)[2] = "taxon_id"
names(df.ncbi)[4] = "gene_name"
#Keep only mouse and human rows
df.hs = unique(df.ncbi[df.ncbi$taxon_id == 9606, c("conversion_id", "gene_name")])
names(df.hs)[2] = "hs_gene_name"
df.ms = unique(df.ncbi[df.ncbi$taxon_id == 10090,c("conversion_id", "gene_name")])
names(df.ms)[2] = "ms_gene_name"
df.conversion = unique(merge(df.hs, df.ms, by.x = "conversion_id", by.y = "conversion_id"))
df.conversion$conversion_id = NULL
setwd(cwd)
names(df.conversion) <- c("hgnc_symbol", "mgi_symbol")
return (df.conversion)
}
#end of function
df.conversion <- make.mm.hs.conversion.table(mm.hs.conversion.file)
## Ensure that df.gmt is a data.frame ##
df.gmt <- data.frame(df.gmt)
## Condense gene columns 3:n into column 3##
df.genes <- df.gmt[,3:ncol(df.gmt)]
df.gmt <- df.gmt[,1:2]
df.gmt[["hgnc_symbol"]] = ""
df.gmt[["mgi_symbol"]] = ""
names(df.gmt)[1] <- "cat_name"
names(df.gmt)[2] <- "comments_1"
df.gmt[["cat_item_size"]] = 0
###############################################################################
## Adding all relevant database columns to df.gmt ##
# To be added
col.vec <- c(
"hgnc_symbol",
"mgi_symbol",
"cat_id",
"cat_name",
"cat_type",
"data_source",
"comments_1",
"comments_2",
"cat_item_size",
"row_names"
)
## Check if dbTablename already exists, and if so, get cat_id vector ##
library(RMySQL)
dbDB <- dbConnect(
MySQL(),
user = db.user,
password = db.pwd,
host = host,
dbname = ref.db
)
tables <- as.vector(dbGetQuery(dbDB, paste0("SHOW TABLES IN ", ref.db))[,1])
if (is.na(match(ref.db.table, tables)) | create.new.table){
## if table does not exist yet ##
first.id <- 1
next.row <- 1
} else {
## table exist already and need to be appended ##
cat.ids <- dbGetQuery(dbDB, paste0("SELECT DISTINCT cat_id FROM ", ref.db.table))$cat_id
ids <- as.numeric(sapply(cat.ids, function(x) unlist(strsplit(x, "__"))[2]))
first.id <- max(ids) + 1
next.row <- as.numeric(dbGetQuery(dbDB, paste0("SELECT MAX(row_names) FROM ", ref.db.table)))
next.row <- next.row + 1
}
dbDisconnect(dbDB)
df.gmt[["cat_id"]] = paste0(ref.db.table, "__", (first.id:(first.id + nrow(df.gmt)-1)))
df.gmt[["cat_type"]] <- rep(cat_type, nrow(df.gmt))
df.gmt[["data_source"]]<- rep(data_source, nrow(df.gmt))
df.gmt[["comments_2"]] <- ""
## Done adding all relevant database columns to df.gmt ##
###############################################################################
## Adding mgi and hgnc gene names ##
for (i in 1:nrow(df.gmt)){
gene.vec <- unique(as.vector(t(df.genes[i,])))
gene.vec <- na.omit(gene.vec)
gene.vec <- gene.vec[gene.vec != ""]
df.gmt[i, "cat_item_size"] = length(gene.vec)
df.gene <- data.frame(gene.vec, stringsAsFactors = FALSE)
df.gene[["gene_name"]] <- as.vector(sapply(gene.vec, function(x) unlist(strsplit(x, ","))[1]))
df.gene[["value"]] <- as.vector(sapply(gene.vec, function(x) unlist(strsplit(x, ","))[2]))
if (unique(df.gene$value)[1] == "" | is.na(df.gene$value)[1] ){
numeric.values.present = FALSE
} else {
numeric.values.present = TRUE
}
if (gene.id == "hgnc_symbol"){
names(df.gene) <- gsub("gene_name", "hgnc_symbol", names(df.gene))
names(df.gene) <- gsub("value", "hgnc_value", names(df.gene))
df.temp <- df.conversion[df.conversion$hgnc_symbol %in% as.vector(df.gene$hgnc_symbol),]
#df.gene <- merge(df.gene, df.temp, by.x = "gene_name", by.y = "hgnc_symbol", all= TRUE)
df.gene <- merge(df.gene, df.temp, by.x = "hgnc_symbol", by.y = "hgnc_symbol", all= TRUE)
df.gene[is.na(df.gene)] = ""
df.gene[["mgi_value"]] = ""
df.gene[df.gene$mgi_symbol != "","mgi_value"] = df.gene[df.gene$mgi_symbol != "","hgnc_value"]
} else if (gene.id == "mgi_symbol"){
names(df.gene) <- gsub("gene_name", "mgi_symbol", names(df.gene))
names(df.gene) <- gsub("value", "mgi_value", names(df.gene))
df.temp <- df.conversion[df.conversion$mgi_symbol %in% as.vector(df.gene$mgi_symbol),]
df.gene <- merge(df.gene, df.temp, by.x = "mgi_symbol", by.y = "mgi_symbol", all = TRUE)
df.gene[is.na(df.gene)] = ""
df.gene[["hgnc_value"]] = ""
df.gene[df.gene$mgi_symbol != "","hgnc_value"] = df.gene[df.gene$mgi_symbol != "","mgi_value"]
} else {
print("Error: No valid gene id provided. Allowed: hgnc_symbol or mgi_symbol")
return()
}
if (i%%1000 == 0){
print(paste0(i, " categories processed..."))
}
#print(
# paste0(
# df.gmt[i,1],
# ": total genes:",
# length(gene.vec),
# "; hgnc_symbol:",
# length(unique(df.gene$hgnc_symbol)),
# "; mgi_symbol:",
# length(unique(df.gene$mgi_symbol))
# )
#)
## Check if data values are present ##
# Create hgnc string
df.temp <- unique(df.gene[df.gene$hgnc_symbol != "", c("hgnc_symbol", "hgnc_value")])
if (numeric.values.present){
hgnc.string <- paste0(df.temp$hgnc_symbol, "(",df.temp$hgnc_value,")")
} else {
hgnc.string <- df.temp$hgnc_symbol
}
hgnc.string <- paste0(";", paste0(hgnc.string, collapse = ";"), ";")
df.gmt[i, "hgnc_symbol"] <- hgnc.string
## Create mgi string ##
df.temp <- unique(df.gene[df.gene$mgi_symbol != "", c("mgi_symbol", "mgi_value")])
if (numeric.values.present){
mgi.string <- paste0(df.temp$mgi_symbol, "(",df.temp$mgi_value,")")
} else {
mgi.string <- df.temp$mgi_symbol
}
mgi.string <- paste0(";", paste0(mgi.string, collapse = ";"), ";")
df.gmt[i, "mgi_symbol"] <- mgi.string
}
## Re-order df.gmt ##
df.gmt[["row_names"]] = next.row:(nrow(df.gmt)+ next.row -1)
df.gmt <- df.gmt[,col.vec]
## Upload to database ##
library(RMySQL)
dbDB = dbConnect(
drv = RMySQL::MySQL(),
user = db.user,
password = db.pwd,
dbname = ref.db,
host = host
)
if (create.new.table){
dbGetQuery(
dbDB,
paste(
"DROP TABLE IF EXISTS ",
ref.db.table,
sep = ""
)
)
}
dbWriteTable(
dbDB,
ref.db.table,
df.gmt,
row.names = FALSE,
append = TRUE,
overwrite = FALSE)
if (create.new.table){
resp <- dbGetQuery(
dbDB,
paste(
"ALTER TABLE `",
ref.db.table,
"` ADD UNIQUE(`row_names`)",
sep = "")
)
resp <- dbGetQuery(
dbDB,
paste(
"ALTER TABLE `",
ref.db.table,
"` ADD PRIMARY KEY(`row_names`)",
sep = "")
)
resp <- dbGetQuery(
dbDB,
paste("ALTER TABLE ",
ref.db.table,
" CHANGE `hgnc_symbol` `hgnc_symbol` LONGTEXT CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `mgi_symbol` `mgi_symbol` LONGTEXT CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `cat_name` `cat_name` VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `cat_id` `cat_id` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `cat_type` `cat_type` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `data_source` `data_source` VARCHAR(100) CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `comments_1` `comments_1` VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `comments_2` `comments_2` VARCHAR(255) CHARACTER SET utf8 COLLATE utf8_general_ci ,
CHANGE `cat_item_size` `cat_item_size` INT(5) NULL DEFAULT NULL,
CHANGE `row_names` `row_names` BIGINT(8) NULL DEFAULT NULL",
sep = ""
)
)
}
dbDisconnect(dbDB)
} # end of function
## ##
###############################################################################
###############################################################################
## (22) prepareDotPlotData() ##
###############################################################################
## This function is a derivative of the Seurat DotPlot function
#' @title DotPlotSB
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
DotPlotSB <- function (
object,
assay = NULL,
features,
cols = c("lightgrey", "blue"),
col.min = -2.5,
col.max = 2.5,
dot.min = 0,
dot.scale = 6,
group.by = NULL,
split.by = NULL,
scale.by = "radius",
scale.min = NA,
scale.max = NA
) {
assay <- assay %||% DefaultAssay(object = object)
DefaultAssay(object = object) <- assay
scale.func <- switch(
EXPR = scale.by,
size = scale_size,
radius = scale_radius,
stop("'scale.by' must be either 'size' or 'radius'")
)
data.features <- FetchData(
object = object,
vars = features
)
data.features$id <- if (is.null(x = group.by)) {
Idents(object = object)
} else {
object[[group.by, drop = TRUE]]
}
if (!is.factor(x = data.features$id)) {
data.features$id <- factor(x = data.features$id)
}
id.levels <- levels(x = data.features$id)
data.features$id <- as.vector(x = data.features$id)
if (!is.null(x = split.by)) {
splits <- object[[split.by, drop = TRUE]]
if (length(x = unique(x = splits)) > length(x = cols)) {
stop("Not enought colors for the number of groups")
}
cols <- cols[1:length(x = unique(x = splits))]
names(x = cols) <- unique(x = splits)
data.features$id <- paste(data.features$id, splits, sep = "_")
unique.splits <- unique(x = splits)
id.levels <- paste0(rep(x = id.levels, each = length(x = unique.splits)),
"_", rep(x = unique(x = splits), times = length(x = id.levels)))
}
PercentAbove <- function(x, threshold) {
return(length(x = x[x > threshold]) / length(x = x))
}
data.plot <- lapply(
X = unique(x = data.features$id),
FUN = function(ident) {
data.use <- data.features[data.features$id == ident, 1:(ncol(x = data.features) - 1), drop = FALSE]
avg.exp <- apply(
X = data.use,
MARGIN = 2,
FUN = function(x) {
return(mean(x = expm1(x = x)))
}
)
pct.exp <- apply(
X = data.use,
MARGIN = 2,
FUN = PercentAbove,
threshold = 0
)
return(list(avg.exp = avg.exp, pct.exp = pct.exp))
}
)
names(x = data.plot) <- unique(x = data.features$id)
data.plot <- lapply(X = names(x = data.plot), FUN = function(x) {
data.use <- as.data.frame(x = data.plot[[x]])
data.use$features.plot <- rownames(x = data.use)
data.use$id <- x
return(data.use)
})
data.plot <- do.call(what = "rbind", args = data.plot)
dfClust <- data.frame(data.plot %>% pivot_wider(!pct.exp, names_from = features.plot, values_from = avg.exp))
row.names(dfClust) <- dfClust$id
dfClust$id <- NULL
if (ncol(dfClust) >=2){
dfDist <- hclust(d=dist(t(dfClust)), method = "ward.D2")
orderVec <- names(dfClust)[dfDist$order]
} else {
orderVec <- names(dfClust)
}
if (!is.null(x = id.levels)) {
data.plot$id <- factor(x = data.plot$id, levels = id.levels)
}
avg.exp.scaled <- sapply(
X = unique(x = data.plot$features.plot),
FUN = function(x) {
data.use <- data.plot[data.plot$features.plot == x, "avg.exp"]
data.use <- scale(x = data.use)
data.use <- MinMax(
data = data.use,
min = col.min,
max = col.max
)
return(data.use)
}
)
avg.exp.scaled <- as.vector(x = t(x = avg.exp.scaled))
if (!is.null(x = split.by)) {
avg.exp.scaled <- as.numeric(
x = cut(x = avg.exp.scaled,
breaks = 20)
)
}
data.plot$avg.exp.scaled <- avg.exp.scaled
data.plot$features.plot <- factor(
x = data.plot$features.plot,
levels = rev(x = features)
)
data.plot$pct.exp[data.plot$pct.exp < dot.min] <- NA
data.plot$pct.exp <- data.plot$pct.exp * 100
if (!is.null(x = split.by)) {
splits.use <- vapply(
X = strsplit(
x = as.character(x = data.plot$id),
split = "_"
), FUN = "[[", FUN.VALUE = character(length = 1L), 2
)
data.plot$colors <- mapply(FUN = function(color, value) {
return(colorRampPalette(colors = c("grey", color))(20)[value])
}, color = cols[splits.use], value = avg.exp.scaled)
}
color.by <- ifelse(
test = is.null(x = split.by),
yes = "avg.exp.scaled",
no = "colors"
)
if (!is.na(x = scale.min)) {
data.plot[data.plot$pct.exp < scale.min, "pct.exp"] <- scale.min
}
if (!is.na(x = scale.max)) {
data.plot[data.plot$pct.exp > scale.max, "pct.exp"] <- scale.max
}
old <- theme_get()
theme_set(theme_bw())
data.plot$features.plot <- factor(data.plot$features.plot, levels = orderVec)
if (is.factor(object@meta.data$clusterName)){
levels <- levels(object@meta.data$clusterName)
} else {
levels <- unique(object@meta.data$clusterName)
}
data.plot$id <- factor(data.plot$id, levels = levels)
data.plot <- na.omit(data.plot)
plot <- ggplot(
data=data.plot, aes_string(x= "id", y="features.plot")
) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8,angle = 45, vjust = 1, hjust = 1),
axis.title.y = element_blank(),
axis.title.x = element_blank(),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12),
legend.position="bottom"
) + geom_point(
aes_string(size = "pct.exp",color = color.by)
#) + coord_fixed(
) + guides(size = guide_legend(title = "Perc Expr")
) + labs(
x = ifelse(test = is.null(x = split.by),
yes = "Identity", no = "Split Identity"),
y = "Features"
)
if (nrow(data.plot) > 10){
plot <- plot + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
}
if (!is.null(x = split.by)) {
plot <- plot + scale_color_identity()
} else if (length(x = cols) == 1) {
plot <- plot + scale_color_distiller(palette = cols)
} else {
plot <- plot + scale_color_gradient(low = cols[1], high = cols[2])
}
if (is.null(x = split.by)) {
plot <- plot + guides(color = guide_colorbar(title = "Avg Expr"))
}
theme_set(old)
return(plot)
}
## ##
###############################################################################
#####################################################
# (15) Function to list all tables in a database #
#####################################################
#' @title list.db.tables.in.db
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
list.db.tables.in.db <- function(dbname = "reference_categories_db_new",
user = "boeings",
password = "",
host = "www.biologic-db.org"
){
library(RMySQL)
dbDB <- dbConnect(MySQL(), user = user, password = password, host = host, dbname=dbname)
table.vector = sort(as.vector(dbGetQuery(dbDB, "SHOW TABLES")[,1]))
dbDisconnect(dbDB)
return(table.vector)
}
# End of function
#####################################################
# (16) Function to list all columns in a db table #
#####################################################
#' @title list.db.table.col.names
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
list.db.table.col.names <- function(dbtable = "interpro_categori",
dbname = "reference_categories_db_new",
user = "boeings",
password = "",
host = "www.biologic-db.org"
){
library(RMySQL)
dbDB <- dbConnect(MySQL(), user = user, password = password, host = host, dbname=dbname)
column.vector = sort(as.vector(dbGetQuery(dbDB, paste0("SHOW COLUMNS in ",dbtable))[,1]))
dbDisconnect(dbDB)
return(column.vector)
}
# End of function
###############################################################################
## Hypergeometric test enrichment ##
###############################################################################
## Method Determine row variability ##
#' @title profileCluster
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
setGeneric(
name="profileCluster",
def=function(
# Input gmt file with categories to test: dfGmt
# Output: table with enrichments
obj = "Obio",
markerList = "residualClusterMarkers",
gmtList = "clusterProfilerGMTList",
nTop = 10,
pvalueCutoff = 0.5
) {
library(clusterProfiler)
library(ggplot2)
library(tidyr)
if (Obio@parameterList$geneIDcolumn != "mgi_symbol" & Obio@parameterList$geneIDcolumn != "hgnc_symbol") {
queryGS <- "hgnc_symbol"
} else {
queryGS <- Obio@parameterList$geneIDcolumn
}
if (Obio@parameterList$host == "10.27.241.234"){
urlString <- "biologic.thecrick.org"
} else {
urlString <- "biologic.crick.ac.uk"
}
VersionPdfExt <- paste0(".V", gsub("-", "", Sys.Date()), ".pdf")
## Set plotting colors ##
plotNames <- sort(unique(names(gmtList)))
library(scales)
plotCols <- hue_pal()(length(plotNames))
names(plotCols) <- plotNames
catEnrichmentList <- list()
## markerList is derrived from a gmt file and has in the one position a category description.
## Determine colors ##
library(scales)
enrCols <- hue_pal()(length(gmtList))
names(enrCols) <- names(gmtList)
for (i in 1:length(markerList)){
geneVec <- markerList[[i]][2:length(markerList)]
geneVec <- geneVec[geneVec != ""]
geneVec <- geneVec[!is.na(geneVec)]
first <- TRUE
if (length(geneVec) > 0){
for (j in 1:length(gmtList)){
egmt <- data.frame(
enricher(
geneVec,
TERM2GENE=gmtList[[j]],
pvalueCutoff = pvalueCutoff
)
)
if (!is.null(egmt)){
if (nrow(egmt) > 0){
egmt[["Collection"]] <- substr(names(gmtList)[j], 1,10)
}
if (first){
dfTempEnriched <- egmt
first <- FALSE
} else {
dfTempEnriched <- rbind(
dfTempEnriched,
egmt
)
}
}
}
}
if (!first & nrow(dfTempEnriched > 0)){
dfTempEnriched <- dfTempEnriched[order(dfTempEnriched$p.adjust, decreasing=F),]
catEnrichmentList[[names(markerList)[i]]] <- dfTempEnriched
}
print(paste0(names(markerList)[i], " done."))
}
## Make the plots ##
plotList <- list()
chnkVec <- as.vector(NULL, mode = "character")
for (i in 1:length(catEnrichmentList)){
catName <- names(catEnrichmentList)[i]
catNameString <- gsub("_", " ", names(catEnrichmentList)[i])
dfEnr <- catEnrichmentList[[i]]
dfEnr[["lg10p"]] <- -1*log10(dfEnr$p.adjust)
dfEnr <- unique(dfEnr[,c("ID", "lg10p", "Collection")])
dfEnr[["Cluster"]] <- catNameString
dfEnr <- dfEnr[order(dfEnr$lg10p, decreasing = T),]
if (i ==1){
dfResTable <- dfEnr
} else {
dfResTable <- rbind(
dfResTable,
dfEnr
)
}
if (nTop > nrow(dfEnr)){
tempSel <- nrow(dfEnr)
} else {
tempSel <- nTop
}
dfEnr <- dfEnr[1:tempSel, ]
dfEnr <- dfEnr[order(dfEnr$lg10p, decreasing = F),]
dfEnr$ID <- substr(dfEnr$ID, 1, 40)
dfEnr$ID <- factor(dfEnr$ID, levels = unique(dfEnr$ID))
tag <- paste0("Cell_Types_", names(catEnrichmentList)[i])
tempPlotCols <- plotCols[unique(dfEnr$Collection)]
plotList[[tag]] <- ggplot(
data=dfEnr, aes(x= ID, y=lg10p, fill=Collection)
) + geom_hline(yintercept = c(-1*log10(0.05)), color = "black", size=0.5, lty=2
) + geom_bar(stat="identity", colour="black"
) + coord_flip() + scale_fill_manual("CatType",values = enrCols) + theme(
axis.text.y = element_text(size=8),
axis.text.x = element_text(size=8),
axis.title.y = element_text(size=8),
axis.title.x = element_text(size=8),
axis.line = element_line(colour = "black"),
panel.border = element_rect(colour = "black", fill=NA, size=1),
plot.title = element_text(hjust = 0.5, size = 12)
) + labs(title = paste0(gsub("_", " ", tag)," enriched genes") ,y = "-log10(padj)", x = ""
) + geom_hline(yintercept = 0, color = "black", size=0.5
) + theme_bw()
## Save to file ##
FNbase <- paste0("CellTypeEnrichment_", tag, VersionPdfExt)
FN <- paste0(obj@parameterList$reportFigDir, FNbase)
FNrel <- paste0("report_figures/", FNbase)
pdf(FN)
print(plotList[[tag]])
dev.off()
link <- paste0(
'<a href="https://', urlString, '/',
obj@parameterList$project_id,
'/category-view?category_type=Cell Type Signatures" target="_blank">CategoryView > Cell Signatures</a>'
)
## Create R markdown chunk ##
figLegend <- paste0(
'**Figure ',
figureCount,
'**: Enrichment analysis of cluster gene signatures to infer the cluster cell type. ',
'Download a pdf of this figure <a href="',FNrel,'" target="_blank">here</a>. To view these gene sets in the context of your data, go to ',link,' and find these categories using the search box.'
)
figureCount <- figureCount + 1
NewChnk <- paste0(
"#### ", gsub("_", " ", tag),
"\n```{r enrichr_", tag, ", results='asis', echo=F, eval=TRUE, warning=FALSE, fig.cap='",
figLegend,"'}\n",
"\n",
"\n print(plotList[['",tag,"']])",
"\n cat( '\n')",
"\n\n\n```\n"
)
chnkVec <- c(
chnkVec,
NewChnk
)
}
returnList <- list(
"chnkVec" = chnkVec,
"plotList" = plotList,
"dfResTable" = dfResTable
)
return(returnList)
})
## Done category enrichments
###############################################################################
###############################################################################
# End createRMDscript #
###############################################################################
###############################################################################
## Open data frame in Excel ##
## Function written by Daniel Cook ##
#' @title excel
#'
#' @description Method description
#' @param agree TBD
#' @keywords TBD
#' @export
#'
#'
excel <- function(df) {
f <- paste0(tempdir(),'/', make.names(deparse(substitute(df))),'.',paste0(sample(letters)[1:5],collapse=""), '.csv')
write.csv(df,f)
system(sprintf("open -a 'Microsoft Excel' %s",f))
}
## Usage
# wrap a dataframe directory
##excel(cars)
# Or pipe output with dplyr
## cars %>% excel()
##
###############################################################################
# setGeneric(
# name="seurat2viewer",
# def=function(
# obj,
# assay = "RNA",
# #slot = "data",
# geneSel = NULL,
# params = NULL,
# projectName = "test"
# ) {
#
# if (is.null(params)){
# params <- scanObjParams(obj)
# }
#
#
# dfCoord <- createDfCoord(obj, params = params)
# names(dfCoord) <- gsub("[.]", "_", names(dfCoord))
# params$x_axis <- gsub("[.]", "_", params$x_axis)
# params$y_axis <- gsub("[.]", "_", params$y_axis)
# params$splitPlotsBy <- gsub("[.]", "_", params$splitPlotsBy)
# params$colorPlotsBy <- gsub("[.]", "_", params$colorPlotsBy)
#
# dfExpr <- createDfExpr(obj = testObj, assay = "RNA")
#
#
# dfIDTable <- dfExpr
# dfIDTable[["gene_id"]] <- 0
# dfIDTable <- unique(dfIDTable[,c("gene", "gene_id")])
# dfIDTable <- dfIDTable[order(dfIDTable$gene, decreasing = F), ]
# dfIDTable[["gene_id"]] <- 1:nrow(dfIDTable)
#
#
# ###############################################################################
# ## Add percentage expressed genes ##
#
#
# top30Var <- head(
# x = VariableFeatures(object = testObj),
# 30
# )
#
# my_genes <- rownames(x = testObj@assays$RNA)
# exp <- FetchData(testObj, my_genes)
# ExprMatrix <- round(as.matrix(colMeans(exp > 0)) *100,1)
# colnames(ExprMatrix)[1] <- "count_cut_off"
# dfExprMatrix <- data.frame(ExprMatrix)
# dfExprMatrix[["gene"]] <- row.names(dfExprMatrix)
# dfExprMatrix <- dfExprMatrix[dfExprMatrix$gene %in% top30Var, ]
# dfExprMatrix <- dfExprMatrix[order(dfExprMatrix$count_cut_off, decreasing = T),]
# geneDefault = as.vector(dfExprMatrix[1,"gene"])
#
# ############
# ## Create database
# projectDir <- paste0(projectName)
# dataDir <- paste0(projectDir, "/data")
# paramDir <- paste0(projectDir, "/parameters")
#
# connectDir <- paste0(dataDir, "/connect")
# projectDB <- paste0(projectName, "_DB")
#
# coordTb <- paste0(projectName, "_meta_data")
# exprTb <- paste0(projectName, "_gene_expr_tb")
# geneTb <- paste0(projectName, "_geneID_tb")
#
# if (!dir.exists(projectDir)){
# dir.create(projectDir)
# }
#
# if (!dir.exists(dataDir)){
# dir.create(dataDir)
# }
#
# if (!dir.exists(connectDir)){
# dir.create(connectDir)
# }
#
# if (!dir.exists(paramDir)){
# dir.create(paramDir)
# }
#
# setwd(dataDir)
# dfCoord[["row_names"]] <- 1:nrow(dfCoord)
# conn <- RSQLite::dbConnect(RSQLite::SQLite(), projectDB)
# RSQLite::dbWriteTable(conn, coordTb, dfCoord, overwrite =T)
# cmd.string <- paste0("CREATE INDEX idx_cellID ON ",coordTb," (cellID);")
# res <- DBI::dbGetQuery(
# conn,
# cmd.string
# )
# cmd.string <- paste0("CREATE INDEX idx_rowNames ON ",coordTb," (row_names);")
# res <- DBI::dbGetQuery(
# conn,
# cmd.string
# )
#
# dfExpr[["row_names"]] <- 1:nrow(dfExpr)
# RSQLite::dbWriteTable(conn, exprTb, dfExpr, overwrite =T)
# cmd.string <- paste0("CREATE INDEX idx_rows ON ",exprTb," (row_names);")
# res <- DBI::dbGetQuery(
# conn,
# cmd.string
# )
# cmd.string <- paste0("CREATE INDEX idx_cells ON ",exprTb," (cellID);")
# res <- DBI::dbGetQuery(
# conn,
# cmd.string
# )
#
# RSQLite::dbWriteTable(conn, geneTb, dfIDTable, overwrite =T)
# RSQLite::dbDisconnect(conn)
#
#
# dfID <- data.frame(
# type = "RSQLite",
# url = "",
# id = "",
# id2 = "",
# db = projectDB,
# coordTb,
# exprTb,
# geneTb,
# default = geneDefault
# )
#
# dfkey <- dfID
# save(dfkey, file = "dfkey.rda")
#
# setwd("connect")
# write.table(
# dfID,
# "db.txt",
# sep = "\t",
# row.names = F
# )
#
#
#
# # setwd("../../parameters")
# # yamlList <- list(
# # "XYsel" = unique(params[["x_axis"]]),
# # "XYsel_names" = unique(names(params[["x_axis"]])),
# # "allColorOptions" = unique(params[["colorPlotsBy"]]),
# # "allColorOptions_names" = unique(names(params[["colorPlotsBy"]])),
# # "splitOptions" = unique(params[["splitPlotsBy"]]),
# # "splitOptions_names" = unique(names(params[["splitPlotsBy"]])),
# # "sampleColorList" = unique(params[["sampleColorList"]]),
# # "sampleColorList_names" = unique(names(params[["sampleColorList"]]))
# # )
# #
# # FN <- paste0("parameters.yaml")
# # yaml::write_yaml(yaml::as.yaml(yamlList), FN, fileEncoding = "UTF-8")
# # setwd("../")
# #
# # if (!dir.exists("Rsc")){
# # dir.create("Rsc")
# # }
# #
# # if (!dir.exists("R")){
# # dir.create("R")
# # }
# #
# # if (!dir.exists("instSC")){
# # dir.create("instSC")
# # }
# #
# # if (!dir.exists("inst")){
# # dir.create("inst")
# # }
# #
# # if (!dir.exists("manSC")){
# # dir.create("manSC")
# # }
# #
# # if (!dir.exists("man")){
# # dir.create("man")
# # }
# #
# # res <- file.copy(system.file("app.r", package = "biologicSC"), ".")
# # res <- file.copy(system.file("NAMESPACE", package = "biologicSC"), ".")
# # res <- file.copy(system.file("DESCRIPTION", package = "biologicSC"), ".")
# #
# # res <- file.copy(system.file("Rsc", package = "biologicSC"), "Rsc", recursive = T)
# # files <- list.files("Rsc/Rsc")
# # res <- file.copy(paste0("Rsc/Rsc/", files), "R")
# #
# # res <- file.copy("Rsc", "R", recursive = T)
# #
# # res <- file.copy(system.file("instSC", package = "biologicSC"), "instSC", recursive = T)
# # files <- list.files("instSC/instSC")
# # res <- file.copy(paste0("instSC/instSC/", files), "inst")
# #
# # res <- file.copy(system.file("manSC", package = "biologicSC"), "manSC", recursive = T)
# # files <- list.files("manSC/manSC")
# # res <- file.copy(paste0("manSC/manSC/", files), "man")
#
# }
# )
###############################################################################
###############################################################################
## Create user and assign privileges ##
#' @title assignDbUsersAndPrivileges
#'
#' @description This function allows you to express your love for the superior furry animal.
#' @param agree Do you agree dogs are the best pet? Defaults to TRUE.
#' @keywords database utils
#' @export
#' @import DBI RMySQL
#' @import methods
# Maximum length for maria db username is 16 #
assignDbUsersAndPrivileges <- function(
accessFilePath = shinyDataPath,
hostDbUrl = "10.27.241.82",
appUserName = substr(paste0(project_id, "_aUser"), 1, 15),
geneDefault = NULL,
domains = c("shiny-bioinformatics.crick.ac.uk", "10.%"),
dbname = "prim.data.db",
tables = c("coordTb" = PCAdbTableName,"exprTb" = expDbTable,"geneTb" = geneTb),
recreateProjectUser = TRUE,
dbAdminUser = "boeings",
dbAdminPwd = "db.pwd",
dataMode = "MySQL"
) {
## Maximum length for app user name is 16
appUserName <- substr(appUserName, 1, 15)
############################
## Helper function
doQuery <- function(
user = "db.user",
password = "db.upload.pwd",
host = "host",
dbname = "primDataDB",
query = "mysql db query",
#existingAccessFileName = existingAccessFileName
resOut = FALSE,
geneDefault = NULL
){
library(RMySQL)
dbDB <- dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname = dbname
)
tryCatch(res <- DBI::dbGetQuery(dbDB, query), error = function(c) {
c$message <- stop(paste0("Error in ", query, "."))
})
DBI::dbDisconnect(dbDB)
if (resOut){
return(res)
}
}
if (file.exists(paste0(accessFilePath, "db.txt"))){
df <- read.delim(paste0(accessFilePath, "db.txt"), header = T, sep="\t", stringsAsFactors = F)
sPwd <- as.vector(df$id2)
sUser <- as.vector(df$id)
} else {
sUser <- substr(appUserName, 1, 30)
sPwd <-c(2:9,letters,LETTERS)
sPwd <- paste(sample(sPwd,8),collapse="")
}
if (!file.exists(paste0(accessFilePath, "db.txt"))){
## Create user in db
for (k in 1:length(domains)) {
query0 <- paste0("SELECT User, Host FROM mysql.user WHERE User = '",sUser,"' AND Host = '",domains[k],"';")
res <- doQuery(
user = dbAdminUser,
password = dbAdminPwd,
host = hostDbUrl,
dbname = dbname,
query = query0,
#existingAccessFileName = existingAccessFileName
resOut = TRUE
)
if (nrow(res) > 0){
query0a <- paste0("DROP USER '",sUser,"'@'",domains[k],"';")
#doQuery(Obio, query = query0a)
doQuery(
user = dbAdminUser,
password = dbAdminPwd,
host = hostDbUrl,
dbname = dbname,
query = query0a,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
}
query1 <- paste0(
"CREATE USER '",
sUser,
"'@'",domains[k],"' IDENTIFIED BY '",
sPwd,
"';"
)
doQuery(
user = dbAdminUser,
password = dbAdminPwd,
host = hostDbUrl,
dbname = dbname,
query = query1,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
} # End for k-loop user fix
## Make password file
## Create log-in file ##
dfID <- data.frame(
type = "main",
dataMode = dataMode,
url = hostDbUrl,
id = sUser,
id2 = sPwd,
db = dbname,
coordTb = tables["coordTb"],
exprTb = tables["exprTb"],
geneTb = tables["geneTb"],
default = geneDefault
)
if (!file.exists(accessFilePath)){
dir.create(accessFilePath, recursive = T)
}
write.table(dfID, paste0(accessFilePath, "db.txt"), row.names = F, sep="\t")
## Done making password file
}
## Done creating users for all domains ##
###########################################################################
###########################################################################
## GRANT access to the app user to all relevant tables ##
for (i in 1:length(tables)){
for (k in 1:length(domains)){
query7 <- paste0(
"GRANT SELECT on ", dbname,".",tables[i], " TO '",sUser,"'@'",domains[k],"';"
)
doQuery(
user = dbAdminUser,
password = dbAdminPwd,
host = hostDbUrl,
dbname = dbname,
query = query7,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
}
}
## Done ##
###########################################################################
}
## End of function assignDbUsersAndPrivileges ##
###############################################################################
###############################################################################
## Upload datatable infile ##
#' @title uploadDbTablInfile
#'
#' @description This function allows you to express your love for the superior furry animal.
#' @param agree Do you agree dogs are the best pet? Defaults to TRUE.
#' @keywords dogs
#' @export
#' @import DBI RMySQL
#' @import methods
uploadDbTableInfile <- function(
host = NULL,
user = NULL,
password = NULL,
prim.data.db = "project.database",
dbTableName = "rnaseqdbTableName",
df.data = "df.data.to.upload",
db.col.parameter.list = list(
"VARCHAR(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("gene_description"),
"VARCHAR(50) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("ENSG", "ENSMUSG", "hgnc_symbol", "mgi_symbol", "uniprot", "entrezgene","display_ptm", "^sequence_window", "p_site_env","for_GSEA_gene_chip","associated_gene_name", "gene_type"),
"VARCHAR(1) CHARACTER SET latin1 COLLATE latin1_swedish_ci" = c("ppos", "amino_acid", "charge","known_site"),
"BIGINT(8) NULL DEFAULT NULL" = c("row_names"),
"INT(8) NULL DEFAULT NULL" = c("row_id", "cluster_order","cluster_id", "count_cut_off", "^position$", "raw_counts"),
"DECIMAL(6,3) NULL DEFAULT NULL" = c("norm_counts", "NES", "logFC", "lg2_avg", "intensity", "^int", "iBAQ","^localization_prob$"),
"DECIMAL(6,5) NULL DEFAULT NULL" = c("padj", "pvalue","^pep$")
),
new.table = TRUE,
cols2Index = NULL,
indexName = NULL,
mode = "MySQL",
tempFileName = "temp.upload.csv"
){
############################
## Helper function
doQuery <- function(
user = "db.user",
password = "db.upload.pwd",
host = "host",
dbname = "primDataDB",
query = "mysql db query",
#existingAccessFileName = existingAccessFileName
resOut = FALSE
){
library(RMySQL)
dbDB <- dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname = dbname
)
tryCatch(res <- DBI::dbGetQuery(dbDB, query), error = function(c) {
c$message <- stop(paste0("Error in ", query, "."))
})
DBI::dbDisconnect(dbDB)
if (resOut){
return(res)
}
}
###########################################################################
## save table locally for in file upload ##
write.csv(df.data, tempFileName, row.names = F)
rm(df.data)
## ##
###########################################################################
## Drop existing table if exists
query1 <- paste0("DROP TABLE IF EXISTS ", dbTableName, ";\n")
doQuery(
user = user,
password = password,
host = host,
dbname = prim.data.db,
query = query1,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
query2 <- paste0(
#query,
"CREATE TABLE IF NOT EXISTS ",
dbTableName,
" (gene VARCHAR(100) CHARACTER SET latin1 COLLATE latin1_swedish_ci, cellID VARCHAR(100) CHARACTER SET latin1 COLLATE latin1_swedish_ci, lg10Expr DECIMAL(6,3) NULL DEFAULT NULL, row_names INT(10) NOT NULL AUTO_INCREMENT,PRIMARY KEY (row_names)); "
)
doQuery(
user = user,
password = password,
host = host,
dbname = prim.data.db,
query = query2,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
print("Data is being rendered. This may take a few minutes for larger datasets.")
## infile upload
query3 <- paste0(
#query,
"LOAD DATA LOCAL INFILE '",
tempFileName,
"' INTO TABLE ",
dbTableName,
" FIELDS TERMINATED BY ',' ENCLOSED BY '\"' LINES TERMINATED BY '\n' IGNORE 1 LINES (gene, cellID, lg10Expr, row_names);"
)
doQuery(
user = user,
password = password,
host = host,
dbname = prim.data.db,
query = query3,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
## alter and index
query4 <- paste0(
#query,
"ALTER TABLE ", dbTableName, " ADD UNIQUE(row_names);"
)
doQuery(
user = user,
password = password,
host = host,
dbname = prim.data.db,
query = query4,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
query5 <- paste0(
#query,
"CREATE INDEX idx_gene ON ", dbTableName, " (gene);"
)
doQuery(
user = user,
password = password,
host = host,
dbname = prim.data.db,
query = query5,
#existingAccessFileName = existingAccessFileName
resOut = FALSE
)
unlink(tempFileName)
print("Data loaded infile.")
}
## Done ##
###############################################################################
###############################################################################
## Function createPowerpointPresentation ##
#' @title doQuery
#'
#'
#' @param user
#' @param password
#' @param host
#' @param dbname
#' @param query
#' @param resOut
#' @param geneDefault
#' @param This can be MySQL or SQLite
#' @import RMySQL
#' @import DBI
#' @import RSQLite
#' @return Res
#' @export
doQuery <- function(
user = "db.user",
password = "db.upload.pwd",
host = "host",
dbname = "primDataDB",
query = "mysql db query",
#existingAccessFileName = existingAccessFileName
resOut = FALSE,
geneDefault = NULL,
mode = "MySQL"
){
#library(RMySQL)
if (mode == "SQLite"){
dbDB <- DBI::dbConnect(
drv = RSQLite::SQLite(),
dbname = dbname
)
} else {
dbDB <- DBI::dbConnect(
drv = RMySQL::MySQL(),
user = user,
password = password,
host = host,
dbname = dbname
)
}
tryCatch(res <- DBI::dbGetQuery(dbDB, query), error = function(c) {
c$message <- stop(paste0("Error in ", query, "."))
})
DBI::dbDisconnect(dbDB)
if (resOut){
return(res)
}
}
## ##
###############################################################################
###############################################################################
## Create Exel Workbook ##
#' @title createExcelWorkbook
#'
#' @description This function creates an Excel workbook from a list with
#' data.frames. Each item of the list will become a sheet in the Excel
#' output file
#' @param excelList A list of dataframes
#' @param outPutFN A output filepath/filename
#' @import openxlsx
#' @export
#'
createExcelWorkbook <- function(
excelList,
outPutFN = "result.table.xlsx"
){
## Check if all names are less than 32 characters ##
names(excelList) <- sapply(names(excelList), function(x) substr(x, 1, 32))
## Make sure all entries are unique - particularly after shortening ##
if (length(unique(names(excelList))) != length(names(excelList))){
names(excelList) <- sapply(names(excelList), function(x) substr(names(excelList),1, 29))
names(excelList) <- paste(names(excelList), "_", 1:lenght(names(excelList)))
}
wb <- openxlsx::createWorkbook()
## Style headers ##
hs1 <- openxlsx::createStyle(
fontColour = "#ffffff",
fgFill = "#000000",
halign = "CENTER",
textDecoration = "Bold"
)
for (i in 1:length(excelList)){
sheet <- names(excelList)[i]
dfOutput <- data.frame(excelList[[i]])
openxlsx::addWorksheet(wb, sheet)
openxlsx::freezePane(wb, sheet , firstActiveRow = 2)
openxlsx::writeData(wb, sheet = sheet, dfOutput, startRow = 1, startCol = 1, headerStyle = hs1)
}
openxlsx::saveWorkbook(
wb,
outPutFN ,
overwrite = TRUE
)
print(paste0("Excel output files create and depoisted in ", outPutFN))
}
## End: Create Excel Workbook ##
###############################################################################
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.