R/IO-methods.R

Defines functions readCNERangesFromSQLite saveCNEToSQLite read.rmskFasta read.rmMask.GRanges writeAxt axtInfo readAxt readBed seqinfoFn

Documented in axtInfo readAxt readBed readCNERangesFromSQLite read.rmMask.GRanges read.rmskFasta saveCNEToSQLite writeAxt

### -----------------------------------------------------------------
### seqinfoFn: get the Seqinfo object from fasta or twoBit file.
### Not Exported!
seqinfoFn <- function(fn){
  fileType <- file_ext(fn)
  genome <- sub("\\..*$", "", basename(fn))
  if(fileType %in% c("fa", "fasta")){
    # fasta file
    seqlengths1 <- fasta.seqlengths(fn)
    names(seqlengths1) <- sapply(strsplit(names(seqlengths1), " "), "[", 1)
    ## Only keep the first part of fasta lines.
    ans <- Seqinfo(seqnames=names(seqlengths1),
                   seqlengths=seqlengths1,
                   genome=genome)
  }else if(fileType == "2bit"){
    # 2bit file
    ans <- seqinfo(TwoBitFile(fn))
  }
  genome(ans) <- genome
  return(ans)
}

### -----------------------------------------------------------------
### readBed: read the bed file into GRanges.
### Exported!
readBed <- function(bedFile, assemblyFn=NULL){
  ## GRanges: 1-based start
  ## bed file: 0-based start
  
  #bed <- .Call2("myReadBed", bedFile, PACKAGE="CNEr")
  bed <- suppressMessages(read_tsv(bedFile, col_names=FALSE, comment="track"))
  ## We only need the first three columns of the bed file, 
  ## but keep the strand information when available
  if(ncol(bed) == 3L){
    strands <- factor("+")
  }else{
    strands <- bed[[6]]
  }
  seqinfoBed <- NULL
  if(!is.null(assemblyFn)){
    seqinfoBed <- seqinfoFn(assemblyFn)
  }
  ans <- GRanges(seqnames=Rle(bed[[1]]),
                 ranges=IRanges(start=bed[[2]]+1L, end=bed[[3]]),
                 strand=strands, seqinfo=seqinfoBed)
  return(ans)
}

### -----------------------------------------------------------------
### read the axt files into an axt object.
### Exported!
readAxt <- function(axtFiles, tAssemblyFn=NULL, qAssemblyFn=NULL){
  if(length(unique((file_ext(axtFiles)))) > 1)
    stop("`axtFiles` must have same extensions!")
  
  # Read axt files into R axt object.
  # The coordinates are 1-based for start and end.
  index_noexists <- !file.exists(axtFiles)
  if(any(index_noexists)){
    stop("No such file ", paste(axtFiles[index_noexists], sep=" "))
  }
  
  if(.Platform$OS.type == "windows"){
    ## The code on Windows platform cannot deal with gzipped axt file
    ## We need to ungzip it first and gzip it back later
    if(any(file_ext(axtFiles) == "gz")){
      axtFiles <- sapply(axtFiles, gunzip)
      on.exit(sapply(axtFiles, gzip))
    }
  }
  
  # Prepare the seqinfo when available
  seqinfoTarget <- NULL
  if(!is.null(tAssemblyFn)){
    seqinfoTarget <- seqinfoFn(tAssemblyFn)
  }
  seqinfoQuery <- NULL
  if(!is.null(qAssemblyFn)){
    seqinfoQuery <- seqinfoFn(qAssemblyFn)
  }
  
  ## Extend the absolute paths of files
  axtFiles <- normalizePath(axtFiles)
  myAxt <- .Call2("myReadAxt", axtFiles, PACKAGE="CNEr")
  axts <- Axt(targetRanges=GRanges(seqnames=myAxt[[1]],
                                   ranges=IRanges(start=myAxt[[2]],
                                                  end=myAxt[[3]]),
                                   strand=myAxt[[4]],
                                   seqinfo=seqinfoTarget),
              targetSeqs=DNAStringSet(myAxt[[5]]),
              queryRanges=GRanges(seqnames=myAxt[[6]],
                                  ranges=IRanges(start=myAxt[[7]],
                                                 end=myAxt[[8]]),
                                  strand=myAxt[[9]],
                                  seqinfo=seqinfoQuery),
              querySeqs=DNAStringSet(myAxt[[10]]),
              score=myAxt[[11]],
              symCount=myAxt[[12]]
            )
  return(axts)
}

### -----------------------------------------------------------------
### read the axt files and return the widths of all the alignments
### Exported!
axtInfo <- function(axtFiles){
  index_noexists <- !file.exists(axtFiles)
  if(any(index_noexists)){
    stop("No such file ", paste(axtFiles[index_noexists], sep=" "))
  }
  ans <- .Call2("axt_info", axtFiles, PACKAGE="CNEr")
  return(ans)
}

### -----------------------------------------------------------------
### write the Axt object to an axt file
### Exported!
writeAxt <- function(axt, con){
  firstLine <- paste(0:(length(axt)-1), seqnames(targetRanges(axt)),
                     start(targetRanges(axt)), end(targetRanges(axt)),
                     seqnames(queryRanges(axt)),
                     start(queryRanges(axt)), end(queryRanges(axt)),
                     strand(queryRanges(axt)), score(axt)
                     )
  secondLine <- targetSeqs(axt)
  thirdLine <- querySeqs(axt)
  wholeLines <- paste(firstLine, as.character(targetSeqs(axt)), 
                      as.character(querySeqs(axt)),
                      "", sep="\n")
  writeLines(wholeLines, con)
}

### -----------------------------------------------------------------
### read RepeatMasker out file into a GRanges object
### Exported!
read.rmMask.GRanges <- function(fn){
  rmMaskOut <- read.table(fn, header=FALSE, sep="", skip=3, as.is=TRUE,
                          col.names=1:16, fill=TRUE)
  rmMaskGRanges <- GRanges(seqnames=rmMaskOut$X5,
                           ranges=IRanges(start=rmMaskOut$X6,
                                          end=rmMaskOut$X7),
                           strand=ifelse(rmMaskOut$X9=="+", "+", "-"),
                           name=rmMaskOut$X10,
                           type=rmMaskOut$X11,
                           score=rmMaskOut$X1)
  return(rmMaskGRanges)
}

### -----------------------------------------------------------------
### read a soft-repeatMasked fasta (repeats in lower case) and 
### get the repeats regions
### Exported!
read.rmskFasta <- function(fn){
  seq <- readBStringSet(fn)
  names(seq) <- sapply(strsplit(names(seq), " "), "[", 1)
  foo3 <- lapply(lapply(strsplit(as.character(seq),"") ,"%in%",
                       c("a","c","g","t")), Rle)
  foo4 <- lapply(foo3, as, "IRanges")
  foo5 <- GRanges(seqnames=Rle(names(foo4), lengths(foo4)),
                  ranges=IRanges(start=unlist(sapply(foo4, start)),
                                 end=unlist(sapply(foo4, end))),
                  strand="+")
  return(foo5)
}

### -----------------------------------------------------------------
### save the CNE class or GRangePairs object into a local SQLite database
### Exported!!
saveCNEToSQLite <- function(x, dbName, tableName=NULL, overwrite=FALSE){
  ## by default tableName is in the format "danRer7_hg19_49_50"
  if(is.null(tableName)){
    tableName <- paste(sub("\\.2bit", "", basename(x@assembly1Fn)),
                       sub("\\.2bit", "", basename(x@assembly2Fn)),
                       x@identity, x@window, sep="_")
  }
  if(class(x) == "CNE"){
    ## CNE class
    cneFinal <- CNEFinal(x)
  }else if(class(x) == "GRangePairs"){
    cneFinal <- x
  }else{
    stop(" x must be a CNE class or GRangePairs class.")
  }
  
  firstCNE <- as.data.frame(first(cneFinal))
  colnames(firstCNE) <- paste0("first.", colnames(firstCNE))
  secondCNE <- as.data.frame(second(cneFinal))
  colnames(secondCNE) <- paste0("second.", colnames(secondCNE))
  cneFinal <- cbind(firstCNE, secondCNE)
  
  if(nrow(cneFinal) == 0L){
    warning("There is no CNEs.")
  }
  
  ## Create the bin column
  cneFinal$first.bin <- binFromCoordRange(cneFinal$first.start,
                                          cneFinal$first.end)
  cneFinal$second.bin <- binFromCoordRange(cneFinal$second.start,
                                         cneFinal$second.end)
  cneFinal <- cneFinal[ ,c("first.bin", "first.seqnames",
                           "first.start", "first.end",
                           "second.bin",
                           "second.seqnames", "second.start", "second.end"
  )]
  
  ## SQLite
  con <- dbConnect(SQLite(), dbname=dbName)
  on.exit(dbDisconnect(con))
  dbWriteTable(con, tableName, cneFinal, row.names=FALSE,
               overwrite=overwrite)
  invisible(tableName)
}

### -----------------------------------------------------------------
### read CNE from a local SQLite database
### Exported!
readCNERangesFromSQLite <- function(dbName, tableName,
                                    chr=NULL, start=NULL, end=NULL,
                                    whichAssembly=c("first", "second"),
                                    minLength=NULL,
                                    tAssemblyFn=NULL, qAssemblyFn=NULL){
  nrGraphs <- 1
  ## Let's make nrGraphs=1, make all the cnes together.
  whichAssembly <- match.arg(whichAssembly)
  con <- dbConnect(SQLite(), dbname=dbName)
  on.exit(dbDisconnect(con))
  
  if(is.null(chr) && is.null(start) & is.null(end)){
    # 1. fetch all the CNEs: chr=NULL, start=NULL, end=NULl
    sqlCmd <- paste("SELECT [first.seqnames],[first.start],[first.end],[second.seqnames],[second.start],[second.end] from",
                    tableName)
  }else if(!is.null(chr) && is.null(start) && is.null(end)){
    # 2. fetch all CNEs on chromosomes chr
    sqlCmd <- switch(whichAssembly,
      "first"=paste("SELECT [first.seqnames],[first.start],[first.end],[second.seqnames],[second.start],[second.end] from",
                    tableName, "WHERE [first.seqnames] IN (",
                    paste(paste0("'", chr, "'"), collapse=","),
                    ")"),
      "second"=paste("SELECT [first.seqnames],[first.start],[first.end],[second.seqnames],[second.start],[second.end] from",
                   tableName, "WHERE [second.seqnames] IN (",
                   paste(paste0("'", chr, "'"), collapse=","),
                   ")")
    )
  }else if(!is.null(chr) && !is.null(start) && !is.null(end)){
    # 3. fetch all CNEs on potentially multiple chr, start, end
    CNEstart <- as.integer(start)
    CNEend <- as.integer(end)
    sqlCmd <- switch(whichAssembly,
      "first"=paste("SELECT [first.seqnames],[first.start],[first.end],[second.seqnames],[second.start],[second.end] from",
                    tableName, "WHERE", paste("([first.seqnames]=",
                                              paste0("'", chr, "'"),
                                              "AND [first.start] >=", CNEstart,
                                              "AND [first.end] <=",
                                              CNEend, "AND",
                      binRestrictionString(CNEstart, CNEend,
                                           "[first.bin]"),
                      ")", collapse=" OR ")),
      "second"=paste("SELECT [first.seqnames],[first.start],[first.end],[second.seqnames],[second.start],[second.end] from",
                   tableName, "WHERE", paste("([second.seqnames]=",
                                             paste0("'", chr, "'"),
                                             "AND [second.start] >=", CNEstart,
                                             "AND [second.end] <=",
                                             CNEend, "AND",
                     binRestrictionString(CNEstart, CNEend,
                                          "[second.bin]"),
                     ")", collapse=" OR "))
      )
  }else{
    stop("Unsupported search criteria!")
  }
  if(!is.null(minLength))
    sqlCmd <- paste(sqlCmd, "AND [first.end]-[first.start]+1 >=", minLength, 
                    "AND [second.end]-[second.start]+1 >=", minLength)
  fetchedCNE <- dbGetQuery(con, sqlCmd)
  
  # Prepare the seqinfo when available
  seqinfoTarget <- NULL
  if(!is.null(tAssemblyFn)){
    seqinfoTarget <- seqinfoFn(tAssemblyFn)
  }
  seqinfoQuery <- NULL
  if(!is.null(qAssemblyFn)){
    seqinfoQuery <- seqinfoFn(qAssemblyFn)
  }
  
  # Return empty GRangePairs when no CNEs are returned.
  if(nrow(fetchedCNE) == 0L){
    return(GRangePairs(first=GRanges(seqinfo=seqinfoTarget),
                       second=GRanges(seqinfo=seqinfoQuery)))
  }
  
  firstGRanges <- GRanges(seqnames=fetchedCNE[ ,1],
                          ranges=IRanges(start=fetchedCNE[ ,2],
                                         end=fetchedCNE[,3]),
                          strand="*", seqinfo=seqinfoTarget)
  lastGRanges <- GRanges(seqnames=fetchedCNE[ ,4],
                         ranges=IRanges(start=fetchedCNE[ ,5],
                                        end=fetchedCNE[ ,6]),
                         strand="*", seqinfo=seqinfoQuery)
  ans <- GRangePairs(first=firstGRanges, second=lastGRanges)
  return(ans)
}
ge11232002/CNEr documentation built on Oct. 26, 2022, 7:08 p.m.