R/getCluster.R
In fcScan: fcScan for detecting clusters of coordinates with user defined options

Documented in getCluster

##' @title fcScan

##' @export getCluster

getCluster <- function(x, w, c, overlap = 0, greedy = FALSE, seqnames = NULL,
                    s = "*" , order = NULL, site_orientation = NULL, 
                    site_overlap = 0, verbose = FALSE) {

    sitesToExclude <- NULL
    final = NULL
    start.time = Sys.time()

    ##check arguments
    if (missing(x)) {
        stop("A data frame, Granges object or input files are needed")
    }

    ## Checking input format
    if (is(x, "data.frame") || is(x, "GRanges")) {
        ## print("data.frame input")
        if(is.null(names(c))) {
            stop("When input is a data frame or GRanges object, site names
in condition must be explicitly defined")
        }
    }
        
        if(is.data.frame(x)){
        x = makeGRangesFromDataFrame(x, keep.extra.columns = TRUE,
            starts.in.df.are.0based = TRUE)
    }

    else if(is(x, "GRanges")){
        x
    }    
    
    else {
        ## print("File(s) input")
        if (length(c) != length(x))
            stop("Condition and files should be of same length")
        
        if(length(which(names(c)=="")>0)){
            stop("You either give names to all conditions or leave them empty")
        }

        ##assigning names to condition if null
        if (is.null(names(c))) {
            names(c) = seq(from = 1,to = length(c),by = 1)
        }

        x = load_data(all_files = x, c = c)
        x <- makeGRangesFromDataFrame(x, keep.extra.columns = TRUE,
            starts.in.df.are.0based = TRUE)
    }
    
    if(length(which(duplicated(names(c))))!=0){
        stop("Names of conditions must be unique")
    }

    if(length(which(c<0))!=0 | !(is.numeric(c))){
        stop("Only positive integers are allowed")
    }

    ## overlap will accept integers only
    if(!(overlap%%1 == 0)){
        stop("Only integers are allowed for overlap")
    }

    indexToExclude <- which(c == 0)

    ## Getting conditions to exclude (i.e with 0 values)
    if(length(indexToExclude) != 0){
        sitesToExclude <- names(indexToExclude)
    }
    
    ## Test if window is a positive integer
    if (w < 0 | w%%1!=0)
        stop("Window size should be a positive integer")
    
    if (!(s %in% c("+", "-", "*")))
        stop('Strand needs to be a valid option. Accepted options are "+", "-"
                and "*"')

    ##if not greedy and order is given
    if(is.null(order) == FALSE){
    ##checking for consistency of name in order and condition
    if(!(all(order %in% names(c))))
        stop("site names between order and condition do not match")

    ##check if number of sites is equal betweeb condition and order
    if(greedy == FALSE){
        count_elements <- c(count(order)$freq)
        names(count_elements) <- count(order)$x
        if(!(all(sort(count_elements) == sort(c)))){
            stop("Greedy is set to FALSE and order is larger than condition")
            }
        }
    }
    
    #site orientation works only if order is provided
    if(!(is.null(site_orientation)) & is.null(order)){
        stop("site_orientation cannot be used unless order is specified")
    }

    #site orientation should be positive or negative
    if(!(all(site_orientation %in% c("+","-")))){
        stop('Site orientation should be "+" or "-"')
    }

    #site orientation input length should have same order length
    if(length(order) != length(site_orientation) & 
        !(is.null(site_orientation))){
        stop("When specified, sites orientation must be defined to
        all sites following 'order' option")
    }

    ##site_overlap should be either positive, negative or zero
    if(!(is.numeric(site_overlap) || site_overlap == 0)){
        stop("Only integers are allowed for distance between sites")
    }

    ##check verbose input argument
    if( !(verbose %in% c("TRUE", "FALSE"))) {
        stop("Verbose should be TRUE or FALSE")
    }

    ## n contains the total number of sites desired
    n = sum(c)

    ##getting sites found on the required seqnamesom
    ### change 2: subsetting from GRanges according to seqnames ###
    if (length(seqnames) > 0) {
        x = x[seqnames(x) %in% seqnames]
    }

    ##getting sites found on the required strand
    ### change 3: subsetting from GRanges according to strand ###
    if (s != "*") {
        x <- x[strand(x) == s]
    }

    ##check if the sites given in condition c are found in the data'
    if( !all(names(c) %in% x$site)) {
        message("Sites in condition do not match sites in data")
        return(NULL)
    }

    cat (length(x), " entries loaded", "\n")

    ##need to subset to keep only the sites required by the user,
    ##if the user wants sites "a","b" and input has "a", "b" and "c",
    ##we subset to get sites "a", "b"
    
    if(!(is.null(sitesToExclude))){
        x
    }
    
    else{
    x = subset(x, x$site %in% names(c))
    }
    
    unique_seqnames = unique((seqnames(x)))

    ## creating an array to fill results
    res = array(data = NA, dim = c(length(x), ncol = 7))
    colnames(res) = c("seqnames", "start", "end", "site", "strand",
                        "isCluster", "status")


    ## looping over chromosomes and call cluster_sites
    for(seq in seq_along(unique_seqnames)){

        gr = subset(x, seqnames == unique_seqnames[seq])
        gr = sort(gr, ignore.strand = TRUE)

        if (length(gr) >= n) {
            result = cluster_sites(gr, w, c, overlap, n,
                    res, s, greedy, order, sitesToExclude, 
                        site_orientation, site_overlap)
            final = rbind(final, result)
        }
    }

    if(length(final) !=0 ){
        final <- GRanges(
            seqnames = as.character(final[,1]), 
            ranges = IRanges(as.numeric(final[,2]), as.numeric(final[,3])),
            sites = as.character(final[,4]),
            strand = as.character(final[,5]),
            isCluster = as.logical(final[,6]),
            status = as.character(final[,7]))

            names(final) <- NULL
    }else{
        message("No cluster found")
        return(NULL)
    }

    ##if verbose is FALSE, get only clusters with "PASS"
    if(verbose == FALSE) {
        final = subset(final, final$status == "PASS")
    }

    ##if TRUE, get everything
    if(verbose == TRUE) {
        final
    }

    end.time = Sys.time()
    print(end.time - start.time)
    return(final)
}


load_data <- function(all_files, c) {
    df = NULL
    for(i in seq_along(all_files)){
        if(grepl("\\.bed$", all_files[i])){
            b = import(all_files[i])
        }
        else if(grepl("\\.vcf$|\\.vcf.gz$", all_files[i])){
            b = rowRanges(readVcf(all_files[i]))
        }
        else{
            stop("only .bed, .vcf and .vcf.gz extensions are accepted")
        }
        b$site = names(c[i])
        df = rbind(df, as.data.frame(b)[, c("seqnames", "start", "end",
                                            "strand", "site")])
    }
    df$strand[df$strand == "*"] <- "+"
    return(df)
}

## n contains the total number of sites desired
## s contains strand
## res is array for temporary results
cluster_sites <- function(gr, w, c, overlap, n, res, s, greedy, order,
                        sitesToExclude, site_orientation, site_overlap){

    start_site <- start(gr)
    end_site <- end(gr)
    end_check <- 0
    site <- gr$site
    exclusion_ls <- sitesToExclude
    strand <- as.vector(strand(gr))
    site_orientation_input <- site_orientation
    site_overlap_input <- site_overlap

    isCluster <- FALSE

    #upper_boundary controls looping limit
    if (greedy == FALSE) {
        # number of sites to be searched must be respected
        # and looping should be made to fit the number of sites in condition
        # because cluster should have max size of 'n' sites
        upper_boundary = length(gr) - n + 1
    } else {
        #greedy = TRUE, no limit on number of sites in the cluster
        upper_boundary = length(gr)
    }
    for (i in seq_len(upper_boundary)) {
        ## check for overlap. basically, the first new
        ## site should satisfy the overlap condition
        if (greedy == TRUE) {
            if (end_site[i] > start_site[i] + w) {
                next
            }
        }
        ## 
        if (isCluster) {
            if ((start_site[i] - end) < overlap) {
                next
            } else {
                isCluster = FALSE
            }
        }
        if (greedy == TRUE) {
            end = end(gr)[end(gr) <= start_site[i] + w]
            end = end[length(end)]
            if(end_check == 0){
                end_check = end
                iEnd = which(end(gr) == end)[1]
                # Cluster of sites to be checked (ls),
                # their orientation (so),
                # their start and end coordinates (sc and ec respectively)
                ls <- site[i:iEnd]
                so <- strand[i:iEnd]
                sc <- start_site[i:iEnd]
                ec <- end_site[i:iEnd]
            }
            else if(end_check == end & is.null(exclusion_ls)){
                next
            }
            else {
                end_check = end
                iEnd = which(end(gr) == end)[1]
                ls <- site[i:iEnd]
                so <- strand[i:iEnd]
                sc <- start_site[i:iEnd]
                ec <- end_site[i:iEnd]
            }
        }
        else {
            end = max((end_site[i:(i + n - 1)]))
        }
        ## putative cluster is bigger than window
        if (greedy == FALSE) {
            if ((end - start_site[i]) > w) {
                next
            } else {
                ls <- site[i:(i + n - 1)]
                so <- strand[i:(i + n - 1)]
                sc <- start_site[i:(i + n -1)]
                ec <- end_site[i:(i + n -1)]
            }
        }

        ## checking if required sites are present
        ans <- testCombn(ls, c, order, exclusion_ls,so,site_orientation_input, 
                sc, ec, site_overlap_input)
        isCluster = ans$logical
        status = ans$status

        ## if we get combnFail, skip
        if(status == "combnFail"){
            next
        }

        res[i, "seqnames"] <- as.character(seqnames(gr)[i])
        res[i, "start"] <- start_site[i]
        res[i, "end"] <- max(ec)
        res[i, "strand"] <- s
        res[i, "status"] <- ans$status
        if (greedy == TRUE) {
            res[i, "site"] <- paste(site[i:iEnd], collapse = ",")
        } else{
            res[i, "site"] <- paste(site[i:(i + n - 1)], collapse = ",")
        }
        res[i, "isCluster"] <- isCluster
    }
    
    res <- res[complete.cases(res), ]

    #Case with one entry, Converting vector to matrix prior to return
    if (is(res, "character")){
        res.names <- names(res)
        res = matrix(res, 1, length(res))
        colnames(res) <- res.names
    }
    return(res)
}

testCombn <- function(ls, c, order, sitesToExclude, so, s_orientation, 
                    start_site, end_site, s_overlap) {
    ans <- list()
    temp <- NULL
    site_orientation_check <- FALSE
    site_overlap_check <- FALSE

    #check for minimum number of sites if satisfying condition
    for (key in names(c)) {
        if (sum((ls == key)) < c[key]) {
            ans$logical = FALSE
            ans$status = "combnFail"
            return(ans)
        }
    }

    if(is.null(order)){ ## order doesn't matter
        ans$logical = TRUE
        ans$status = "PASS"
    }else{ # check for order validity
        if(grepl(paste(order,collapse=";"),paste(ls,collapse=";")) == TRUE 
            & is.null(s_orientation))
        {
            ans$logical = TRUE
            ans$status = "PASS"
        }
        # check for sites orientation validity 
        else if(grepl(paste(order,collapse=";"),paste(ls,collapse=";")) == TRUE 
            & !(is.null(s_orientation))){
            for(i in seq_along(ls)){#use of seq_along as advised by BiocCheck
                temp <- as.vector(na.omit(ls[i:(i+length(order)-1)]))
                if((length(temp) == length(order)) && (all(temp == order)) && 
                    (all(as.vector(na.omit(so[i:(i+length(order)-1)] 
                        == s_orientation))))){
                    site_orientation_check <- TRUE
            }}
            if(site_orientation_check){
            ans$logical = TRUE
            ans$status = "PASS"
        }else{
            ans$logical = FALSE
            ans$status = "siteOrientation"
            }
        }
        else{
            ans$logical = FALSE
            ans$status = "orderFail"
        }
    }
    
    # check for excluded sites - zero sites in condition
    if(!(is.null(sitesToExclude))){
        for (exc_site in sitesToExclude){
            if(length(grep(exc_site, ls, value = TRUE))>0){
            ans$logical = FALSE
            ans$status = "excludedSites"
        }
        }
    }

    # check for distance between sites within cluster
    if(s_overlap != 0){
        if(s_overlap > 0){
        #if s_overlap is positive, 
        #it means sites should have min distance and above
            for(i in seq_len(length(start_site)-1)){
                if((cbind(start_site, end_site)[(i+1),1]) - 
                    (cbind(start_site, end_site)[(i),2]) < s_overlap){
                    site_overlap_check <- TRUE
                    break
                }
            }
        }

        #if s_overlap is negative, 
        #it means sites should have max distance and below
        else if(s_overlap < 0){
            for(i in seq_len(length(start_site)-1)){
                if((cbind(start_site, end_site)[(i+1),1]) - 
                    (cbind(start_site, end_site)[(i),2]) > abs(s_overlap)){
                    site_overlap_check <- TRUE
                    break
                }
            }
        }
        
        if(site_overlap_check){
            if(ans$status == "PASS"){
            ans$status = "siteOverlap"
            ans$logical = FALSE
            }
        }
    }
    return(ans)
}