#' Get tss region sequence of target genes in regulaotry relationships
#'
#' @param gtf Gene transfer format, you can download it from http://www.ensembl.org/info/data/ftp/index.html
#' @param gene.use character, indicating target genes
#' @param upstream_length numeric, indicating upstream region length of TSS
#' @param downstream_length numeric, indicating downstream region length of TSS
#'
#' @return data.frame, contains four columns are target gene, chr, start and end.
#' @export
#'
get_tss_region <- function(gtf,gene.use,upstream_length=1000,downstream_length=500){
validInput(gtf,'gtf','df')
validInput(gene.use,'gene.use','character')
validInput(upstream_length,'upstream_length','numeric')
validInput(downstream_length,'downstream_length','numeric')
gtfgene=gtf[gtf$V3=='gene',]
genes <- apply(gtfgene, 1, extract_genes)
gtfgene$genes <- genes
final <- gtfgene[gtfgene$genes%in%gene.use,]
chr1 <- gtfgene[gtfgene$genes%in%gene.use,]$V1
start1 <- c()
end1 <- c()
for (i in 1:nrow(final)) {
if (final[i,7]=='-') {
start1 <- c(start1,as.numeric(final[i,5])-upstream_length)
end1 <- c(end1,as.numeric(final[i,5])+downstream_length)
}else{
start1 <- c(start1,as.numeric(final[i,4])-upstream_length)
end1 <- c(end1,as.numeric(final[i,4])+downstream_length)
}
}
final$chr <- chr1
final$start <- start1
final$end <- end1
final <- final[,(ncol(final)-3):ncol(final)]
return(final)
}
extract_genes <- function(gtf){
id1 <- strsplit(gtf[9],';')[[1]][1]
id2 <- strsplit(id1,' ')[[1]][2]
return(id2)
}
#' Use fimo to scan the TSS regions of candidate genes to find binding motifs
#'
#' @param gene_tss TSS region of genes. first column should be gene, second column
#' should be chr, third column should be start, fourth column should be end.
#' generated by \code{\link{get_tss_region}}
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own
#' motif data base, but the formata use be the same as our built-in motif database.
#' @param refdir character, indicating the path of reference genome. Reference
#' genome can be download from https://hgdownload.soe.ucsc.edu/downloads.html
#' @param fimodir character, indicating path of fimo software,
#' if you have added fimo to the environment variable, just set this argument as 'fimo'
#' @param outputdir1 character, indicating the output path of the shell scripts
#' and sequence of target genes tss regions.(function 'find_motifs_targetgenes'
#' will automatically generate two folders ('fasta' and 'fimo') in the path
#' 'outputdir1', and store sequence of target genes tss regions in the 'fasta'
#' and shell scripts in the 'fimo')
#' @param Motifdir character, indicating the path of meme motif file
#' @param sequencedir character, indicating the path of sequence of target genes
#' tss regions. If it's NULL, this parameter will be paste0(outputdir1,'fasta/')
#' @param select_motif logic, indicating whether to select motifs whose related
#' transcription factors are in genes of gene_tss
#' @param use_nohup logic, indicating whether use nohup to run all fimo scripts simultaneously
#' @importFrom stringr str_ends
#' @return
#' @export
#'
#' @examples
find_motifs_targetgenes <- function(gene_tss,motif,refdir,fimodir,outputdir1, Motifdir
, sequencedir = NULL,select_motif = T, use_nohup = F){
validInput(refdir,'refdir','fileexists')
validInput(Motifdir,'Motifdir','direxists')
validInput(outputdir1,'outputdir','direxists')
if (str_ends(outputdir1,'/')==FALSE) {
warning('the last character of outputdir1 is not "/"')
}
fasta <- Biostrings::readBStringSet(refdir, format = "fasta", nrec = -1L,
skip = 0L, seek.first.rec = FALSE,
use.names = TRUE)
gene_tss[,3] <- as.integer(gene_tss[,3])
gene_tss[,4] <- as.integer(gene_tss[,4])
if (select_motif==T) {
motif1 <- motifs_select(motif,gene_tss[,1])
}else{motif1 = motif}
outputdir <- paste0(outputdir1,'fimo/')
if (is.null(sequencedir)) {
sequencedir <- paste0(outputdir1,'fasta/')
}
fimoall <- c()
if (!'fasta' %in% dir(outputdir1)) {
dir.create(paste0(outputdir1,'fasta'))
}
if (!'fimo' %in% dir(outputdir1)) {
dir.create(paste0(outputdir1,'fimo'))
}
for (i in 1:nrow(gene_tss)) {
if (use_nohup==TRUE) {
fimo1 <- paste0('nohup sh ',outputdir1,'fimo/',gene_tss[i,1],'/','Fimo1.sh &')
}else if(use_nohup==FALSE){
fimo1 <- paste0('sh ',outputdir1,'fimo/',gene_tss[i,1],'/','Fimo1.sh ;')
}else{stop('parameter use_nohup should be TRUE or FALSE')}
fimoall <- c(fimoall,fimo1)
dir.create(paste0(outputdir1,'fimo/',gene_tss[i,1]))
fasta1 <- getfasta2(gene_tss[i,2:4],fasta)
write.table(fasta1,paste0(outputdir1,'fasta/',gene_tss[i,1],'.fa')
,col.names=F,row.names=F,quote=F)
fimodir1 <- fimodir
outputdir12 <- paste0(outputdir1,'fimo/',gene_tss[i,1],'/')
outputdir11 <- paste0(outputdir,gene_tss[i,1],'/')
sequencedir1 <- paste0(sequencedir,gene_tss[i,1],'.fa')
find_motifs(motif1,step=500,fimodir1, outputdir12, outputdir11, Motifdir,
sequencedir1)
}
fimoall <- as.data.frame(fimoall)
write.table(fimoall,paste0(outputdir1,'fimo/','fimoall.sh'),
quote = F,row.names = F,col.names = F)
}
#' Generate regulation database accoding to the fimo result
#'
#' @param motif_dir character, indicating the path of fimo result
#' @param motif motif file, you can choose our bulit-in motif database of
#' 'mus musculus', 'homo sapiens', 'zebrafish' and 'chicken' by 'motif = Tranfac201803_Mm_MotifTFsF',
#' 'motif = Tranfac201803_Hs_MotifTFsF', 'motif = Tranfac201803_Zf_MotifTFsF',
#' 'motif = Tranfac201803_Ch_MotifTFsF' respectively, or you can upload your own
#' motif data base, but the formata use be the same as our built-in motif database.
#' @importFrom rlang is_empty
#' @return
#' @export
#'
#' @examples
generate_fimo_regulation <- function(motif_dir,motif){
validInput(motif_dir,'motif_dir','direxists')
validInput(motif,'motif','df')
targetgenes <- dir(motif_dir)
for (i in 1:length(targetgenes)) {
DirFile1 <- list.files(paste0(motif_dir,targetgenes[i]))
if (!rlang::is_empty(DirFile1)) {
info <- file.info(paste0(motif_dir,targetgenes[i],'/',DirFile1))
rownames(info) <- DirFile1
info <- info[info$size>0,]
info <- info[!grepl(c('Fimo_All.sh'),rownames(info)),]
info <- info[!grepl(c('Fimo..sh'),rownames(info)),]
info <- info[!grepl(c('Fimo...sh'),rownames(info)),]
if (nrow(info)>0) {
motif1 <- rownames(info)
motif1 <- gsub('.txt','',motif1)
motif2 <- motif[motif$Accession %in% motif1,]
motifgene <- apply(motif2, 1, split_motif)
motifgene <- unlist(motifgene)
motifgene1 <- paste(motifgene,collapse = ';')
if (exists('regulation_final') == FALSE) {
regulation_final <- data.frame(motifgene1,targetgenes[i])
colnames(regulation_final) <- c('TF','Target')
}else{
regulation1 <- data.frame(motifgene1,targetgenes[i])
rownames(regulation1) <- i
colnames(regulation1) <- c('TF','Target')
regulation_final <- rbind(regulation_final,regulation1)
}
}
}
}
return(regulation_final)
}
#' Filter regulatory relationships according to binding motifs generated by fimo
#' @description If ATAC-seq data is not available, binding motifs of target genes
#' can be used to filter regulatory relationships.
#' @param fimo_regulation regulation database, you can download it from xxx, or
#' generate it by \code{\link{generate_fimo_regulation}}
#' @param regulatory_relationships unfiltered regulatory_relationships, generated by
#' \code{\link{get_cor}}
#'
#' @return
#' @export
#'
#' @examples
filter_regulation_fimo <- function(fimo_regulation,regulatory_relationships){
if (!'TF' %in% colnames(regulatory_relationships)) {
stop('regulatory_relationships should contain "TF" column')
}
if (!'Target' %in% colnames(regulatory_relationships)) {
stop('regulatory_relationships should contain "Target" column')
}
fimo_pair <- apply(fimo_regulation, 1, function(x1){
TFs <- strsplit(x1[1],';')[[1]]
Target <- x1[2]
regulation <- paste(TFs,Target)
return(regulation)
})
fimo_pair <- unlist(fimo_pair)
regulation_pair <- paste(regulatory_relationships[,'TF'],regulatory_relationships[,'Target'])
regulation1 <- regulatory_relationships[regulation_pair %in% fimo_pair,]
return(regulation1)
}
split_motif <- function(motif){
gene1 <- strsplit(motif[5],';')[[1]]
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.