#' Read summary data from fast5 files.
#'
#' Reads one or more fast5 files and collects summary information about them.
#'
#' Currently this function assumes all files passed to it come from the same
#' sequencing run. It makes no effort to verify this, for example by checking
#' run identifiers or file names. If files from multiple runs are supplied
#' they will be collated together, and any analysis performed on them will
#' represent the mixture of those experiments.
#'
#' @param files Character vector of fast5 files to be read.
#' @return Object of class \linkS4class{Fast5Summary}
#' @examples \dontrun{
#' fast5files <- list.files('/foo/bar/', pattern = '\\.fast5$', full.names = TRUE)
#' summaryData <- readFast5Summary(fast5files)
#' }
#' @export
#' @importFrom tibble tibble as_tibble
#' @importFrom dplyr filter mutate
#' @importFrom stringr str_match
#' @importFrom ShortRead append
#' @importFrom utils packageVersion
readFast5Summary <- function(files) {
## some files can't be opened, so we filter them here
message("Checking file validity")
files <- files[file.exists(files)]
if(length(files) == 0) {
stop('None of the provided files can be accessed. ',
'Have you supplied the correct path?')
}
fileStatus <- sapply(files, .checkOpening, USE.NAMES = FALSE)
files <- files[ which(fileStatus) ]
status <- .fast5status(files = sample(files, size = min(length(files), 15)))
if(status$loc_template == "") {
stop("No basecalls for template strand found. Aborting")
}
message("Reading Channel Data")
if(status$read_in_name) {
readNums <- as.integer(str_match(string = files, pattern = "read[_]?([0-9]+)")[,2])
} else {
readNums <- rep(NA, length(files))
}
readInfo <- do.call("rbind", mapply(.getReadChannelMux2, files, readNums, dontCheck = FALSE,
USE.NAMES = FALSE, SIMPLIFY = FALSE))
readInfo <- as_tibble(cbind(id = 1:nrow(readInfo), file = basename(files), readInfo))
if(status$raw_reads) {
message("Reading Raw Data")
## raw data is the median signal, and the duration of the read
rawData <- as_tibble(do.call("rbind", mapply(.getRawSummary, files, readNums, dontCheck = TRUE,
USE.NAMES = FALSE, SIMPLIFY = FALSE)))
}
if(status$event_detection) {
message("Reading event data")
eventData <- do.call("rbind", mapply(.getEventsSummary, files, readNums, dontCheck = TRUE,
USE.NAMES = FALSE, SIMPLIFY = FALSE))
} else {
message("Event data not found, using timings from the raw signal instead")
eventData <- do.call("rbind", mapply(.getRawStartDuration, files, readNums,
USE.NAMES = FALSE, SIMPLIFY = FALSE))
colnames(eventData) <- c("start_time", "duration")
}
if(status$raw_reads) {
rawEventData <- as_tibble(cbind(id = readInfo[['id']], rawData, eventData))
} else {
rawEventData <- as_tibble(cbind(id = readInfo[['id']], eventData))
}
## we convert timing data into seconds.
## To do this we use the sampling rate stored in the first file
samplingRate <- .getSamplingRate(files[1])
rawEventData <- mutate(rawEventData,
start_time = start_time / samplingRate,
duration = duration / samplingRate)
message("Reading Template Data")
d <- str_match(pattern = "_([12]D)_", string = status$loc_template)[,2]
template <- do.call("rbind", mapply(.getBaseCalledSummary, files, dontCheck = FALSE,
strand = "template", d = d,
SIMPLIFY = FALSE, USE.NAMES = FALSE))
template <- mutate(template, id = readInfo[['id']])
template <- filter(template, !(is.na(num_events)))
message("Reading Template FASTQ")
## get the fastq for those that have it
fq_t <- mapply(.getFastqString, files[ template[['id']] ], strand = "template", d = d)
fq_t <- .processFastqVec(fq_t, readIDs = template[['id']], appendID = "_template")
fastq_template <- fq_t$fastq
## if there are any invalid entries we need to remove them
if(length(fq_t$invalid)) {
template <- template[-fq_t$invalid,]
}
baseCalled <- template
if(nchar(status$loc_complement)) {
message("Reading Complement Data")
d <- str_match(pattern = "_([12]D)_", string = status$loc_complement)[,2]
complement <- do.call("rbind", mapply(.getBaseCalledSummary, files, dontCheck = FALSE,
strand = "complement", d = d,
SIMPLIFY = FALSE, USE.NAMES = FALSE))
complement <- mutate(complement, id = readInfo[['id']])
complement <- filter(complement, !(is.na(num_events)))
message("Reading Complement FASTQ")
fq_c <- mapply(.getFastqString, files[ complement[['id']] ], strand = "complement", d = d)
fq_c <- .processFastqVec(fq_c, readIDs = complement[['id']], appendID = "_complement")
fastq_complement <- fq_c$fastq
## if there are any invalid entries we need to remove them
if(length(fq_c$invalid)) {
complement <- complement[-fq_c$invalid,]
}
} else {
fastq_complement <- ShortRead::ShortReadQ()
complement <- NULL
}
## we haven't read anything about 2D reads yet, so we need to identify which
## files have them. Then we'll read only those
idx2D <- which(sapply(files, .groupExistsString, group = "/Analyses/Basecall_2D_000/BaseCalled_2D"))
if(length(idx2D)) {
message("Reading 2D FASTQ")
fq_2D <- sapply(files[ idx2D ], .getFastqString, strand = "2D", d = "2D")
fq_2D <- .processFastqVec(fq_2D, readIDs = readInfo[['id']][idx2D], appendID = "_2D")
fastq_2D <- fq_2D$fastq
}
## We update the individual strands to indicate if they are part of a full 2D read
template <- as_tibble(cbind(template, full_2D = template[['id']] %in% idx2D))
if(!is.null(complement)) {
complement <- as_tibble(cbind(complement, full_2D = complement[['id']] %in% idx2D))
}
## combine the template and complement data
baseCalled <- rbind(template, complement)
fastq <- ShortRead::append(fastq_template, fastq_complement)
if(length(idx2D)) {
fastq <- ShortRead::append(fastq, fastq_2D)
}
message("Done")
obj <- new("Fast5Summary",
readInfo = readInfo,
rawData = tibble(),
eventData = rawEventData,
baseCalled = baseCalled,
fastq = fastq,
versions = list('IONiseR' = strsplit(as.character(packageVersion("IONiseR")), ".", fixed = TRUE)[[1]])
)
return(obj)
}
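## readFast5Summary.mc is a parallel variant of readFast5Summary() and is not
## exported. It distributes the per-file reading steps across 'ncores' worker
## processes using BiocParallel::bpmapply(), but otherwise collects the same
## summary information. Note that, unlike the serial version, it does not
## first drop non-existent files and it always attempts to read event data.
## Hypothetical usage (the 'fast5files' vector is illustrative):
##   summaryData <- readFast5Summary.mc(fast5files, ncores = 4)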
#' @importFrom BiocParallel MulticoreParam register bpmapply
readFast5Summary.mc <- function(files, ncores = 2) {
register(MulticoreParam(workers = ncores))
## some files can't be opened, so we filter them here
message("Checking file validity")
fileStatus <- sapply(files, .checkOpening, USE.NAMES = FALSE)
files <- files[ which(fileStatus) ]
status <- .fast5status(files = sample(files, size = min(length(files), 15)))
if(status$loc_template == "") {
stop("No basecalling for template strand found. Aborting")
}
message("Reading Channel Data")
if(status$read_in_name) {
readNums <- as.integer(str_match(string = files, pattern = "read[_]?([0-9]+)")[,2])
} else {
readNums <- rep(NA, length(files))
}
readInfo <- do.call("rbind", bpmapply(.getReadChannelMux2, files, readNums, dontCheck = TRUE,
USE.NAMES = FALSE, SIMPLIFY = FALSE))
readInfo <- as_tibble(cbind(id = 1:nrow(readInfo), file = basename(files), readInfo))
if(status$raw_reads) {
message("Reading Raw Data")
## raw data is the median signal, and the duration of the read
rawData <- as_tibble(do.call("rbind", bpmapply(.getRawSummary, files, readNums, dontCheck = TRUE,
USE.NAMES = FALSE, SIMPLIFY = FALSE)))
}
message("Reading Event Data")
eventData <- do.call("rbind", bpmapply(.getEventsSummary, files, readNums, dontCheck = TRUE,
USE.NAMES = FALSE, SIMPLIFY = FALSE))
if(status$raw_reads) {
rawEventData <- as_tibble(cbind(id = readInfo[['id']], rawData, eventData))
} else {
rawEventData <- as_tibble(cbind(id = readInfo[['id']], eventData))
}
## we convert timing data into seconds.
## To do this we find the sampling rate stored in one file
samplingRate <- .getSamplingRate(files[1])
rawEventData <- mutate(rawEventData,
start_time = start_time / samplingRate,
duration = duration / samplingRate)
message("Reading Template Data")
d <- str_match(pattern = "_([12]D)_", string = status$loc_template)[,2]
template <- do.call("rbind", bpmapply(.getBaseCalledSummary, files, dontCheck = FALSE,
strand = "template", d = d,
SIMPLIFY = FALSE, USE.NAMES = FALSE))
template <- mutate(template, id = readInfo[['id']])
template <- filter(template, !(is.na(num_events)))
message("Reading Template FASTQ")
## get the fastq for those that have it
fq_t <- bpmapply(.getFastqString, files[ template[['id']] ], strand = "template", d = d)
fq_t <- .processFastqVec(fq_t, readIDs = template[['id']], appendID = "_template")
fastq_template <- fq_t$fastq
## if there are any invalid entries we need to remove them
if(length(fq_t$invalid)) {
template <- template[-fq_t$invalid,]
}
baseCalled <- template
if(nchar(status$loc_complement)) {
message("Reading Complement Data")
d <- str_match(pattern = "_([12]D)_", string = status$loc_complement)[,2]
complement <- do.call("rbind", bpmapply(.getBaseCalledSummary, files, dontCheck = FALSE,
strand = "complement", d = d,
SIMPLIFY = FALSE, USE.NAMES = FALSE))
complement <- mutate(complement, id = readInfo[['id']])
complement <- filter(complement, !(is.na(num_events)))
message("Reading Complement FASTQ")
fq_c <- bpmapply(.getFastqString, files[ complement[['id']] ], strand = "complement", d = d)
fq_c <- .processFastqVec(fq_c, readIDs = complement[['id']], appendID = "_complement")
fastq_complement <- fq_c$fastq
## if there are any invalid entries we need to remove them
if(length(fq_c$invalid)) {
complement <- complement[-fq_c$invalid,]
}
} else {
fastq_complement <- ShortRead::ShortReadQ()
complement <- NULL
}
## we haven't read anything about 2D reads yet, so we need to identify which
## files have them. Then we'll read only those
idx2D <- which(sapply(files, .groupExistsString, group = "/Analyses/Basecall_2D_000/BaseCalled_2D"))
if(length(idx2D)) {
message("Reading 2D FASTQ")
fq_2D <- sapply(files[ idx2D ], .getFastqString, strand = "2D", d = "2D")
fq_2D <- .processFastqVec(fq_2D, readIDs = readInfo[['id']][idx2D], appendID = "_2D")
fastq_2D <- fq_2D$fastq
}
## We update the individual strands to indicate if they are part of a full 2D read
template <- as_tibble(cbind(template, full_2D = template[['id']] %in% idx2D))
if(!is.null(complement)) {
complement <- as_tibble(cbind(complement, full_2D = complement[['id']] %in% idx2D))
}
## combine the template and complement data
baseCalled <- rbind(template, complement)
fastq <- ShortRead::append(fastq_template, fastq_complement)
if(length(idx2D)) {
fastq <- ShortRead::append(fastq, fastq_2D)
}
message("Done")
obj <- new("Fast5Summary",
readInfo = readInfo,
rawData = tibble(),
eventData = rawEventData,
baseCalled = baseCalled,
fastq = fastq,
versions = list('IONiseR' = strsplit(as.character(packageVersion("IONiseR")), ".", fixed = TRUE)[[1]])
)
return(obj)
}