R/merge_footprints.R

Defines functions getfasta2 getfasta merge_footprints get_merged_fasta

Documented in get_merged_fasta

#' merge footprints and return corresponding sequence-
#' @description This function first merges footprints whose distances is less
#' than 4, and then obtains the sequence of each footprint based on the reference genome,
#' and  output the sequence in FASTA format
#'
#' @param footprints footprints file, generated by dnase2tf or Hint or other
#' related software, name of first column should be 'chr', name of second column
#' should be 'start' and name of thrid column should be 'end'.
#' @param fastadir character, indiating the path of reference genome
#' @param distance numeric, indicating the cutoff for merging footprints
#'
#' @return return sequence of footprints in fasta format
#' @export
#' @importFrom Biostrings readBStringSet
#' @examples fastadir='Genome/GRCm38Chr.fasta'
#' #merged_fasta <- get_merged_fasta(fdr005,fastadir)
get_merged_fasta <- function(footprints, fastadir, distance = 4) {
  validInput(footprints,'footprints','df')
  validInput(fastadir,'fastadir','direxists')
  validInput(distance,'distance','numeric')
  merged_footprints <- merge_footprints(footprints, ditance = distance)
  fasta <- getfasta(merged_footprints, fastadir = fastadir)
  return(fasta)
}


merge_footprints <- function(footprints, ditance = 4, revise = TRUE) {
  con1 <- footprints
  str1 <- con1[1,1]
  start1 <- con1[1,2]
  end1 <- con1[1,3]
  pva1 <- con1[1,4]
  no1 <- 1
  out1 <- c()
  for (i in 2:nrow(con1)) {
    str2 <- as.character(con1[i, ][1])
    start2 <- as.numeric(con1[i, ][2])
    end2 <- as.numeric(con1[i, ][3])
    pva2 <- as.numeric(con1[i, ][4])
    if (str2 == str1 & (start1 >= start2 & start1 <= end2 | end1 >= start2 &
                        end1 <= end2 | start1 <= start2 & end1 >= end2 | start2 >
                        end1 & (start2 - end1) <= ditance | start1 > end2 &
                        (start1 - end2) <= ditance)) {
      acc22 <- sort(c(start1, start2, end1, end2))
      start1 <- acc22[1]
      end1 <- acc22[4]
      pva1 <- pva1 + pva2
      no1 <- no1 + 1
    } else {
      pva2 <- pva1 / no1
      col1 <- paste(str1, start1, end1, pva2, sep = "\t")
      out1 <- c(out1, col1)
      str1 <- str2
      start1 <- start2
      end1 <- end2
      pva1 <- as.numeric(con1[i, ][4])
      no1 <- 1
    }
  }
  out2 <- as.data.frame(t(as.data.frame(strsplit(out1, "\t"))))
  colnames(out2) <- c("chr", "strat", "stop", "pvalue")
  out2[, 2] <- as.numeric(out2[, 2])
  out2[, 3] <- as.numeric(out2[, 3])
  if (revise == TRUE) {
    out2[, 2] <- out2[, 2] - 2
    out2[, 3] <- out2[, 3] + 2
    return(out2)
  } else {
    return(out2)
  }
}


getfasta <- function(merged_footprints, fastadir) {
  fasta <- Biostrings::readBStringSet(fastadir, format = "fasta", nrec = -1L,
                                      skip = 0L, seek.first.rec = FALSE,
                                      use.names = TRUE)
  fasta1 <- c()
  for (i in 1:nrow(merged_footprints)) {
    name <- paste0(">", merged_footprints[i, 1], ":", merged_footprints[i, 2],
                   "-", merged_footprints[i, 3])
    if (merged_footprints[i, 3] > length(fasta[[merged_footprints[i, 1]]])) {
      sequence <- toupper(as.character(fasta[[merged_footprints[i, 1]]][
        (merged_footprints[i, 2] + 1):length(fasta[[merged_footprints[i, 1]]])]))
      name <- paste0(">", merged_footprints[i, 1], ":", merged_footprints[i, 2],
                     "-", length(fasta[[merged_footprints[i, 1]]]))
    } else{
      sequence <- toupper(as.character(fasta[[merged_footprints[i, 1]]][
        (merged_footprints[i, 2] + 1):merged_footprints[i, 3]]))
    }
    fasta1 <- c(fasta1, name, sequence)
  }
  fasta1 <- as.data.frame(fasta1)
  return(fasta1)
}

getfasta2 <- function(merged_footprints, fasta) {
  fasta1 <- c()
  for (i in 1:nrow(merged_footprints)) {
    name <- paste0(">", merged_footprints[i, 1], ":", merged_footprints[i, 2],
                   "-", merged_footprints[i, 3])
    if (merged_footprints[i, 3] > length(fasta[[merged_footprints[i, 1]]])) {
      sequence <- toupper(as.character(fasta[[merged_footprints[i, 1]]][
        (merged_footprints[i, 2] + 1):length(fasta[[merged_footprints[i, 1]]])]))
      name <- paste0(">", merged_footprints[i, 1], ":", merged_footprints[i, 2],
                     "-", length(fasta[[merged_footprints[i, 1]]]))
    } else{
      sequence <- toupper(as.character(fasta[[merged_footprints[i, 1]]][
        (merged_footprints[i, 2] + 1):merged_footprints[i, 3]]))
    }
    fasta1 <- c(fasta1, name, sequence)
  }
  fasta1 <- as.data.frame(fasta1)
  return(fasta1)
}
jiang-junyao/IReNA documentation built on Nov. 4, 2024, 8:29 p.m.