R/xGRsampling.r

#' Function to generate random samples for data genomic regions from background genomic regions
#'
#' \code{xGRsampling} is supposed to randomly generate samples for data genomic regions from background genomic regions. To do so, we first identify background islands, that is, non-overlapping regions. Then, we keep only parts of data genomic regions that fall into these background islands. For each kept genomic region, a randomised region of the same length is sampled from the corresponding background islands. If required, the randomised region can be restricted to be no more than (eg 10000bp) away from data genomic regions. 
#'
#' @param GR.data an input data GR object, containing a set of genomic regions based on which to generate a null distribution
#' @param GR.background an input background GR object, containing a set of genomic regions to randomly sample from. It can be a GR list object or a list of GR objects
#' @param num.samples the number of samples randomly generated
#' @param gap.max the maximum distance of background islands to be considered away from data regions. Only background islands no far way from this distance will be considered. For example, if it is 0, meaning that only background islands that overlapp with genomic regions will be considered. By default, it is 50000
#' @param max.distance the maximum distance away from data regions that is allowed when generating random samples. By default, it is NULl meaning no such restriction
#' @param verbose logical to indicate whether the messages will be displayed in the screen. By default, it sets to false for no display
#' @param RData.location the characters to tell the location of built-in RData files. See \code{\link{xRDataLoader}} for details
#' @return 
#' a list of GR ojects, each containing an GR oject storing a sample.
#' @export
#' @seealso \code{\link{xGRsampling}}
#' @include xGRsampling.r
#' @examples
#' \dontrun{
#' # Load the XGR package and specify the location of built-in data
#' library(XGR)
#' RData.location <- "http://galahad.well.ox.ac.uk/bigdata"
#' 
#' # Enrichment analysis for GWAS SNPs from ImmunoBase
#' # a) provide input data GR object storing GWAS SNPs
#' dbSNP_GWAS <- xRDataLoader(RData.customised='dbSNP_GWAS', RData.location=RData.location)
#' 
#' # b) provide backgorund data GR object storing FANTOM5 cell-specific enhancers
#' FANTOM5_Enhancer_Cell <- xRDataLoader(RData.customised='FANTOM5_Enhancer_Cell', RData.location=RData.location)
#' 
#' # c) generate random samples as a list of GR objects
#' sGR_List <- xGRsampling(GR.data=dbSNP_GWAS, GR.background=FANTOM5_Enhancer_Cell, num.samples=1000, RData.location=RData.location)
#' }

xGRsampling <- function(GR.data, GR.background, num.samples=100, gap.max=50000, max.distance=NULL, verbose=T, RData.location="http://galahad.well.ox.ac.uk/bigdata")
{

	if(is.null(max.distance)){
		max.distance <- gap.max
	}else if(max.distance > gap.max){
		max.distance <- gap.max
	}
	
  	## Check input GR.data and GR.background
  	if (class(GR.data) != "GRanges") {
    	stop("The function must apply to a 'GRanges' object for input data.\n")
  	}
  	
  	if(class(GR.background) == "GRangesList"){
  		GR.background <- BiocGenerics::unlist(GR.background)
  	}else if(class(GR.background) == "list"){
  		GR.background <- BiocGenerics::unlist(GenomicRanges::GRangesList(GR.background))
  	}
  	if (class(GR.background) != "GRanges") {
    	stop("The function must apply to a 'GRanges' object for input background.\n")
  	}
  	
	if(verbose){
		now <- Sys.time()
		message(sprintf("First, get non-overlapping regions for both input data and background (%s) ...", as.character(now)), appendLF=T)
	}
    ## get reduced ranges (ie non-overlapping regions)
    ### data GR
    dGR_reduced <- IRanges::reduce(GR.data)
    ### background GR
    bGR_reduced <- IRanges::reduce(GR.background)
	if(verbose){
		now <- Sys.time()
		message(sprintf("\tnon-overlapping regions: %d for data, %d for background", length(dGR_reduced), length(bGR_reduced)), appendLF=T)
	}
    
	if(verbose){
		now <- Sys.time()
		message(sprintf("Second, keep only data regions that are within background regions (%s) ...", as.character(now)), appendLF=T)
	}
	#####################################
	## A function to return an GR object storing overlapped regions (ie only overlapped regions!)
	mergeOverlaps <- function(qGR, sGR, maxgap=-1L, minoverlap=0L){
		hits <- as.matrix(as.data.frame(GenomicRanges::findOverlaps(query=qGR, subject=sGR, maxgap=maxgap, minoverlap=minoverlap, type="any", select="all", ignore.strand=T)))
		qhits <- qGR[hits[,1]]
		shits <- sGR[hits[,2]]

		oGR <- IRanges::pintersect(qhits, shits)
		IRanges::reduce(oGR)
	}
	#####################################
	## update data GR after considering background
	dGR_reduced <- mergeOverlaps(qGR=dGR_reduced, sGR=bGR_reduced, maxgap=-1L, minoverlap=0L)
	if(verbose){
		now <- Sys.time()
		message(sprintf("\t%d within background", length(dGR_reduced)), appendLF=T)
	}

	if(verbose){
		now <- Sys.time()
		message(sprintf("Third, find background islands that contain data regions (%s) ...", as.character(now)), appendLF=T)
	}
	## find islands
	hits <- as.matrix(as.data.frame(GenomicRanges::findOverlaps(query=dGR_reduced, subject=GR.background, maxgap=gap.max-1, minoverlap=0L, type="any", select="all", ignore.strand=T)))
	ind_data <- hits[,1]
	ind_background <- hits[,2]
	dt_ls <- split(x=ind_background, f=ind_data)
	
	if(verbose){
		now <- Sys.time()
		message(sprintf("\t%d background islands", length(unique(ind_background))), appendLF=T)
	}
    
    if(is.null(max.distance)){
		if(verbose){
			now <- Sys.time()
			message(sprintf("Fourth, define the sampling range (%s) ...", as.character(now)), appendLF=T)
		}
    }else{
		max.distance <- as.integer(max.distance)
		if(verbose){
			now <- Sys.time()
			message(sprintf("Fourth, define the sampling range within %d bp distance away from data regions (%s) ...", max.distance, as.character(now)), appendLF=T)
		}
    }
	## convert into data.frame
    df_data <- GenomicRanges::as.data.frame(dGR_reduced, row.names=NULL)
    df_background <- GenomicRanges::as.data.frame(GR.background, row.names=NULL)
    range_ls <- lapply(1:length(dt_ls), function(i){
    	df_dt <- df_data[as.numeric(names(dt_ls)[i]), ]
    	# data width
    	dw <- df_dt$width
    	
    	df_bg <- df_background[dt_ls[[i]], ]
    	res <- lapply(1:nrow(df_bg), function(j){
			# background start and end
			bs <- df_bg$start[j]
			be <- df_bg$end[j]
			
			# range start and end
			rs <- bs
			re <- be-dw
			# within the distance restriction
			if(!is.null(max.distance)){
				if(rs < df_dt$start - max.distance){
					rs <- df_dt$start - max.distance
				}
				if(re > df_dt$end + max.distance){
					re <- df_dt$end + max.distance
				}
			}
			seq(rs, re)
    	})
    	unlist(res)
    })
    
	if(verbose){
		now <- Sys.time()
		message(sprintf("Fifth, do '%d' sampling for the starting points (%s) ...", num.samples, as.character(now)), appendLF=T)
	}
    res_ls <- lapply(range_ls, function(x){
    	base::sample(x, num.samples, replace=T)
    })
    df_samples <- do.call(rbind, res_ls)
    
	if(verbose){
		now <- Sys.time()
		message(sprintf("Finally, construct GR for each sampling (%s) ...", as.character(now)), appendLF=T)
	}
	## 'df_dt_all' for all data regions   
	df_dt_all <- df_data[as.numeric(names(dt_ls)), ]
    sGR_list <- lapply(1:ncol(df_samples), function(j){
		sGR <- GenomicRanges::GRanges(
			seqnames=S4Vectors::Rle(df_dt_all$seqnames),
			ranges = IRanges::IRanges(start=df_samples[,j], end=df_samples[,j]+df_dt_all$width-1),
			strand = S4Vectors::Rle(df_dt_all$strand)
		)
    })

	invisible(sGR_list)
}

Try the XGR package in your browser

Any scripts or data that you put into this service are public.

XGR documentation built on June 18, 2019, 3:01 p.m.