# Copied and modified from Matrix eQTL by Andrey A. Shabalin
# http://www.bios.unc.edu/research/genomic_software/Matrix_eQTL/
# http://cran.r-project.org/web/packages/policies.html
#library(methods)
modelLINEAR = 117348L;
modelANOVA = 47074L;
modelLINEAR_CROSS = 1113461L;
.seq = function(a,b){if(a<=b){a:b}else{integer(0)}};
# @exportClass SlicedData
# @exportMethod [[ [[<- colnames colnames<- dim length ncol NCOL nrow NROW rownames rownames<- as.matrix rowMeans rowSums colMeans colSums
#' @importFrom methods setRefClass
SlicedData <- setRefClass( "SlicedData",
fields = list(
dataEnv = "environment",
nSlices1 = "numeric",
rowNameSlices = "list",
columnNames = "character",
fileDelimiter = "character",
fileSkipColumns = "numeric",
fileSkipRows = "numeric",
fileSliceSize = "numeric",
fileOmitCharacters = "character"
),
methods = list(
initialize = function( mat = NULL ) {
dataEnv <<- new.env(hash = TRUE, size = 29L);
nSlices1 <<- 0L;
if(!is.null(mat)) {
CreateFromMatrix(mat);
}
fileSliceSize <<- 1000;
fileDelimiter <<- "\t";
fileSkipColumns <<- 1L;
fileSkipRows <<- 1L;
fileOmitCharacters <<- "NA"
return(invisible(.self));
},
CreateFromMatrix = function( mat ) {
stopifnot( class(mat) == "matrix" );
setSliceRaw( 1L ,mat );
rns = rownames( mat, do.NULL = FALSE);
#if( is.null(rns) ) {
# rns = paste( "Row_",(1:nrow(mat)), sep="" );
#}
rowNameSlices <<- list(rns);
cns = colnames( mat, do.NULL = FALSE );
#if( is.null(cns) ){
# cns = paste( "Col_",(1:ncol(mat)), sep="" );
#}
columnNames <<- cns;
return(invisible(.self));
},
getSlice = function(sl) {
value = get(paste(sl), dataEnv);
if( is.raw(value) ) {
storage.mode(value) = "double";
value[value == 255] = NA;
}
return( value )
},
getSliceRaw = function(sl) {
return( get(paste(sl), dataEnv) )
},
setSliceRaw = function(sl, value) {
assign( paste(sl), value, dataEnv )
if( nSlices1 < sl ) {
nSlices1 <<- sl;
}
},
setSlice = function(sl, value) {
if( length(value) > 0 ) {
if( all(as.integer(value) == value, na.rm = TRUE) ) {
if( (min(value, na.rm = TRUE) >= 0 ) &&
(max(value, na.rm = TRUE) < 255) )
{
nv = value;
suppressWarnings({storage.mode(nv) = "raw"});
nv[ is.na(value)] = as.raw(255);
value = nv;
} else {
storage.mode(value) = "integer";
}
}
}
setSliceRaw(sl, value);
},
nSlices = function() {
return( nSlices1 );
},
LoadFile = function(filename, skipRows = NULL, skipColumns = NULL, sliceSize = NULL, omitCharacters = NULL, delimiter = NULL, rowNamesColumn = 1) {
if( !is.null(skipRows) ) {
fileSkipRows <<- skipRows;
}
if( !is.null(skipColumns) ) {
fileSkipColumns <<- skipColumns;
}
if( !is.null(omitCharacters) ) {
fileOmitCharacters <<- omitCharacters;
}
if( !is.null(sliceSize) ) {
fileSliceSize <<- sliceSize;
}
if( !is.null(delimiter) ) {
fileDelimiter <<- delimiter;
}
stopifnot( (fileSkipColumns == 0) || (rowNamesColumn <= fileSkipColumns) )
stopifnot( (fileSkipColumns == 0) || (rowNamesColumn >= 1) )
fid = file(description = filename, open = "rt", blocking = FALSE, raw = FALSE)
# clean object if file is open
Clear();
lines = readLines(con = fid, n = max(fileSkipRows,1L), ok = TRUE, warn = TRUE)
line1 = tail(lines,1);
splt = strsplit(line1, split = fileDelimiter, fixed = TRUE);
if( fileSkipRows > 0L ) {
columnNames <<- splt[[1]]; # [ -(1:fileSkipColumns) ];
} else {
seek(fid, 0)
}
rm( lines, line1, splt );
rowNameSlices <<- vector("list", 15);
curSliceId = 0L;
repeat
{
# preallocate names and data
if(length(rowNameSlices) < curSliceId) {
rowNameSlices[[2L*curSliceId]] <<- NULL;
}
curSliceId = curSliceId + 1L;
# read sliceSize rows
rowtag = vector("character",fileSliceSize);
rowvals = vector("list",fileSliceSize);
for(i in 1:fileSliceSize) {
temp = "";
if( fileSkipColumns > 0L ) {
temp = scan(file = fid, what = character(), n = fileSkipColumns, quiet = TRUE,sep = fileDelimiter);
}
rowtag[i] = temp[rowNamesColumn];#paste(temp,collapse=" ");
rowvals[[i]] = scan(file = fid, what = double(), nlines = 1, quiet = TRUE, sep = fileDelimiter, na.strings = fileOmitCharacters);
if( length(rowvals[[i]]) == 0L ) {
if(i==1L) {
rowtag = matrix(0, 0, 0);
rowvals = character(0);
} else {
rowtag = rowtag[ 1:(i-1) ];
rowvals = rowvals[ 1:(i-1) ];
}
break;
}
}
if( length(rowtag) == 0L ) {
curSliceId = curSliceId - 1L;
break;
}
rowNameSlices[[curSliceId]] <<- rowtag;
data = c(rowvals, recursive = TRUE);
dim(data) = c(length(rowvals[[1]]), length(rowvals));
data = t(data);
setSlice(curSliceId, data);
if( length(rowtag) < fileSliceSize ) {
break;
}
numtxt = formatC(curSliceId*fileSliceSize, big.mark=",", format = "f", digits = 0)
#cat( "Rows read: ", numtxt, "\n");
flush.console()
}
close(fid)
if( fileSkipRows == 0 ) {
columnNames <<- paste("Col_", (1:nCols()), sep="");
} else {
columnNames <<- tail(columnNames, ncol(getSliceRaw(1)));
}
if( fileSkipColumns == 0 ) {
cnt = 0L;
for( sl in 1:nSlices() ) {
nr = length(getSliceRaw(sl));
rowNameSlices[[sl]] <<- paste("Row_",cnt + (1:nr),sep="");
cnt = cnt + nr;
}
}
rowNameSlices <<- rowNameSlices[1:curSliceId];
#cat("Rows read: ", nRows(), " done.\n");
return(invisible(.self));
},
SaveFile = function(filename) {
if( nSlices() == 0 ) {
cat("No data to save");
return();
}
fid = file(filename,"wt");
for( sl in 1:nSlices() ) {
z = getSlice(sl);
rownames(z) = rowNameSlices[[sl]];
colnames(z) = columnNames;
write.table(z, file = fid, sep = "\t",
col.names = (if(sl == 1){NA}else{FALSE}));
}
close(fid);
},
nRows = function() {
s = 0L;
for(sl in .seq(1,nSlices())) {
s = s + nrow(getSliceRaw(sl));
}
return( s )
},
nCols = function() {
if( nSlices() == 0L ) {
return(0L);
} else {
return( ncol(getSliceRaw(1L)) )
}
},
Clear = function() {
for( sl in .seq(1,nSlices()) ) {
rm(list = paste(sl), envir = dataEnv)
}
nSlices1 <<- 0L;
rowNameSlices <<- list();
columnNames <<- character();
return(invisible(.self));
},
IsCombined = function() {
return( nSlices() <= 1L );
},
GetAllRowNames = function() {
return( c(rowNameSlices, recursive=TRUE) );
},
GetNRowsInSlice = function(sl) {
return( length( rowNameSlices[[sl]] ) );
},
SetNanRowMean = function() {
if( (nCols() == 0L) ) {
return(invisible(.self));
}
for( sl in .seq(1,nSlices()) ) {
slice = getSlice(sl);
if( any(is.na(slice)) ) {
rowmean = rowMeans(slice, na.rm = TRUE);
rowmean[is.na(rowmean)] = 0L;
for( j in which(!complete.cases(slice)) ) {
where1 = is.na(slice[j, ]);
slice[j, where1] = rowmean[j];
}
setSlice(sl, slice);
}
}
return(invisible(.self));
},
RowStandardizeCentered = function() {
for(sl in .seq(1,nSlices()) ) {
slice = getSlice(sl);
div = sqrt( rowSums(slice^2) );
div[ div == 0 ] = 1;
setSlice(sl, slice/div);
}
return(invisible(.self));
},
CombineInOneSlice = function() {
if( nSlices() <= 1L ) {
return(invisible(.self));
}
nc = nCols();
nr = nRows();
datatypes = c("raw","integer","double");
datafuns = c(as.raw, as.integer, as.double);
datatype = character(nSlices());
for(sl in 1:nSlices()) {
datatype[sl] = typeof(getSliceRaw(sl));
}
mch = max(match(datatype,datatypes,nomatch = length(datatypes)));
datafun = datafuns[[mch]];
newData = matrix(datafun(0), nrow = nr, ncol = nc);
offset = 0;
for(sl in 1:nSlices()) {
if(mch==1) {
slice = getSliceRaw(sl);
} else {
slice = getSlice(sl);
}
newData[ offset + (1:nrow(slice)),] = datafun(slice);
setSlice(sl, numeric());
offset = offset + nrow(slice);
}
nSlices1 <<- 1L;
setSliceRaw(1L, newData);
rm(newData);
newrowNameSlices = GetAllRowNames();
rowNameSlices <<- list(newrowNameSlices)
return(invisible(.self));
},
ResliceCombined = function(sliceSize = -1) {
if( sliceSize > 0L ) {
fileSliceSize <<- sliceSize;
}
if( fileSliceSize <= 0 ) {
fileSliceSize <<- 1000;
}
if( IsCombined() ) {
nRows1 = nRows();
if(nRows1 == 0L) {
return(invisible(.self));
}
newNSlices = floor( (nRows1 + fileSliceSize - 1)/fileSliceSize );
oldData = getSliceRaw(1L);
#oldNames = rowNameSlices[[1]];
newNameslices = vector("list",newNSlices)
for( sl in 1:newNSlices ) {
range = (1+(sl-1)*fileSliceSize) : (min(nRows1,sl*fileSliceSize));
newpart = oldData[range, ,drop = FALSE];
if( is.raw(oldData) ) {
setSliceRaw( sl, newpart);
} else {
setSlice( sl, newpart);
}
newNameslices[[sl]] = rowNameSlices[[1]][range];
}
rowNameSlices <<- newNameslices ;
} else {
stop("Reslice of a sliced matrix is not supported yet. Use CombineInOneSlice first.");
}
return(invisible(.self));
},
Clone = function() {
clone = SlicedData$new();
for(sl in .seq(1,nSlices()) ) {
clone$setSliceRaw(sl,getSliceRaw(sl));
}
clone$rowNameSlices = rowNameSlices;
clone$columnNames = columnNames;
clone$fileDelimiter = fileDelimiter;
clone$fileSkipColumns = fileSkipColumns;
clone$fileSkipRows = fileSkipRows;
clone$fileSliceSize = fileSliceSize;
clone$fileOmitCharacters = fileOmitCharacters;
return( clone );
},
RowMatrixMultiply = function(multiplier) {
for(sl in .seq(1,nSlices()) ) {
setSlice(sl, getSlice(sl) %*% multiplier);
}
return(invisible(.self));
},
ColumnSubsample = function(subset) {
for(sl in .seq(1,nSlices()) ) {
setSliceRaw(sl, getSliceRaw(sl)[ ,subset, drop = FALSE]);
}
columnNames <<- columnNames[subset];
return(invisible(.self));
},
RowReorderSimple = function(ordr) {
# had to use an inefficient and dirty method
# due to horrible memory management in R
if( (typeof(ordr) == "logical") && all(ordr) ) {
return(invisible(.self));
}
if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) {
return(invisible(.self));
}
CombineInOneSlice();
gc();
setSliceRaw( 1L, getSliceRaw(1L)[ordr, ] );
rowNameSlices[[1]] <<- rowNameSlices[[1]][ordr];
gc();
ResliceCombined();
gc();
return(invisible(.self));
},
RowReorder = function(ordr) {
# transform logical into indices
if( typeof(ordr) == "logical" ) {
if( length(ordr) == nRows() ) {
ordr = which(ordr);
} else {
stop("Parameter \"ordr\" has wrong length")
}
}
## first, check that anything has to be done at all
if( (length(ordr) == nRows()) && all(ordr == (1:length(ordr))) ) {
return(invisible(.self));
}
## check bounds
#if( (min(ordr) < 1) || (max(ordr) > nRows()) ) {
# stop("Parameter \"ordr\" is out of bounds");
#}
## slice the data into individual rows
all_rows = vector("list", nSlices())
for( i in 1:nSlices() ) {
slice = getSliceRaw(i)
all_rows[[i]] = split(slice, 1:nrow(slice))
setSliceRaw(i,numeric())
}
gc();
all_rows = unlist(all_rows, recursive=FALSE, use.names = FALSE);
## Reorder the rows
all_rows = all_rows[ordr];
## get row names
all_names = GetAllRowNames();
## erase the set
rowNameSlices <<- list();
## sort names
all_names = all_names[ordr];
##
## Make slices back
nrows = length(all_rows);
nSlices1 <<- as.integer((nrows+fileSliceSize-1)/fileSliceSize);
##cat(nrows, " ", nSlices1);
rowNameSlices1 = vector("list", nSlices1);
for( i in 1:nSlices1 ) {
fr = 1 + fileSliceSize*(i-1);
to = min( fileSliceSize*i, nrows);
subset = all_rows[fr:to];
types = unlist(lapply(subset,typeof));
israw = (types == "raw")
if(!all(israw == israw[1])) {
# some raw and some are not
subset = lapply(subset, function(x){if(is.raw(x)){x=as.integer(x);x[x==255] = NA;return(x)}else{return(x)}});
}
subset = unlist(subset);
dim(subset) = c( length(all_rows[[fr]]) , to - fr + 1)
#subset = matrix(subset, ncol = (to-fr+1));
if(is.raw(subset)) {
setSliceRaw(i, t(subset));
} else {
setSlice(i, t(subset));
}
rowNameSlices1[[i]] = all_names[fr:to];
all_rows[fr:to] = 0;
all_names[fr:to] = 0;
}
rowNameSlices <<- rowNameSlices1;
gc();
return(invisible(.self));
},
RowRemoveZeroEps = function(){
for(sl in .seq(1,nSlices()) ) {
slice = getSlice(sl);
amean = rowMeans(abs(slice));
remove = (amean < .Machine$double.eps*nCols());
if(any(remove)) {
rowNameSlices[[sl]] <<- rowNameSlices[[sl]][!remove];
setSlice(sl, slice[!remove, , drop = FALSE]);
}
}
return(invisible(.self));
},
FindRow = function(rowname) {
for(sl in .seq(1,nSlices()) ) {
mch = match(rowname,rowNameSlices[[sl]], nomatch = 0);
if( mch > 0 )
{
row = getSlice(sl)[mch[1], , drop=FALSE];
rownames(row) = rowname;
colnames(row) = columnNames;
return( list(slice = sl, item = mch, row = row) );
}
}
return( NULL );
},
show = function() {
#cat("SlicedData object. For more information type: ?SlicedData\n");
#cat("Number of columns:", nCols(), "\n");
#cat("Number of rows:", nRows(), "\n");
#cat("Data is stored in", nSlices(), "slices\n");
if(nCols()>0) {
z = getSlice(1L);
if(nrow(z)>0) {
z = z[1:min(nrow(z),10L), 1:min(ncol(z),10L), drop = FALSE];
rownames(z) = rowNameSlices[[1]][1:nrow(z)];
colnames(z) = columnNames[1:ncol(z)];
#cat("Top left corner of the first slice (up to 10x10):\n");
methods:::show(z)
}
}
}
))
setGeneric("nrow")
setMethod("nrow", "SlicedData", function(x) {
return( x$nRows() );
})
setGeneric("NROW")
setMethod("NROW", "SlicedData", function(x) {
return( x$nRows() );
})
setGeneric("ncol")
setMethod("ncol", "SlicedData", function(x) {
return( x$nCols() );
})
setGeneric("NCOL")
setMethod("NCOL", "SlicedData", function(x) {
return( x$nCols() );
})
setGeneric("dim")
setMethod("dim", "SlicedData", function(x) {
return( c(x$nRows(),x$nCols()) );
})
setGeneric("colnames")
setMethod("colnames", "SlicedData", function(x) {
return( x$columnNames );
})
setGeneric("rownames")
setMethod("rownames", "SlicedData", function(x) {
return( x$GetAllRowNames() );
})
setMethod("[[", "SlicedData", function(x,i) {
return( x$getSlice(i) );
})
setGeneric("length")
setMethod("length", "SlicedData", function(x) {
return( x$nSlices() );
})
setMethod("[[<-", "SlicedData", function(x,i,value) {
x$setSlice(i, value);
return(x);
})
summary.SlicedData = function(object, ...) {
z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices());
return(z);
}
##### setGeneric("summary") #####
#setMethod("summary", "SlicedData", function(object, ...) {
# z = c(nCols = object$nCols(), nRows = object$nRows(), nSlices = object$nSlices());
# return(z);
# })
#setGeneric("show", standardGeneric("show"))
# setMethod("show", "SlicedData", function(object) {
# cat("SlicedData object. For more information type: ?SlicedData\n");
# cat("Number of columns:", object$nCols(), "\n");
# cat("Number of rows:", object$nRows(), "\n");
# cat("Data is stored in", object$nSlices(), "slices\n");
# if(object$nSlices()>0) {
# z = object$getSlice(1);
# if(nrow(z)>0) {
# z = z[1:min(nrow(z),10), 1:min(ncol(z),10), drop = FALSE];
# rownames(z) = object$rowNameSlices[[1]][1:nrow(z)];
# colnames(z) = object$columnNames[1:ncol(z)];
# cat("Top left corner of the first slice (up to 10x10):\n");
# show(z)
# }
# }
# })
setGeneric("as.matrix")
setMethod("as.matrix", "SlicedData", function(x) {
if(x$nSlices() == 0) {
return( matrix(0,0,0) );
}
if(x$nSlices() > 1) {
copy = x$Clone();
copy$CombineInOneSlice();
} else {
copy = x;
}
mat = copy$getSlice(1L);
rownames(mat) = rownames(copy);
colnames(mat) = colnames(copy);
return( mat );
})
setGeneric("colnames<-")
setMethod("colnames<-", "SlicedData", function(x,value) {
stopifnot( class(value) == "character" );
stopifnot( length(value) == x$nCols() );
x$columnNames = value;
return(x);
})
setGeneric("rownames<-")
setMethod("rownames<-", "SlicedData", function(x,value) {
stopifnot( class(value) == "character" );
stopifnot( length(value) == x$nRows() );
start = 1;
newNameSlices = vector("list", x$nSlices());
for( i in .seq(1,x$nSlices()) ) {
nr = nrow(x$getSliceRaw(i));
newNameSlices[[i]] = value[ start:(start+nr-1) ];
start = start + nr;
}
x$rowNameSlices = newNameSlices;
return(x);
})
setGeneric("rowSums")
setMethod("rowSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
if(x$nSlices() == 0) {
return( numeric() );
}
stopifnot( dims == 1 );
thesum = vector("list", x$nSlices());
for( i in 1:x$nSlices() ) {
thesum[[i]] = rowSums(x$getSlice(i), na.rm)
}
return(unlist(thesum, recursive = FALSE, use.names = FALSE));
})
setGeneric("rowMeans")
setMethod("rowMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
if(x$nSlices() == 0) {
return( numeric() );
}
stopifnot( dims == 1 );
thesum = vector("list", x$nSlices());
for( i in 1:x$nSlices() ) {
thesum[[i]] = rowMeans(x$getSlice(i), na.rm)
}
return(unlist(thesum, recursive = FALSE, use.names = FALSE));
})
setGeneric("colSums")
setMethod("colSums", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
if(x$nCols() == 0) {
return( numeric() );
}
stopifnot( dims == 1 );
thesum = 0;
for( i in .seq(1,x$nSlices()) ) {
thesum = thesum + colSums(x$getSlice(i), na.rm)
}
return(thesum);
})
setGeneric("colMeans")
setMethod("colMeans", "SlicedData", function(x, na.rm = FALSE, dims = 1L) {
if(x$nCols() == 0) {
return( numeric() );
}
stopifnot( dims == 1 );
thesum = 0;
thecounts = x$nRows();
for( i in .seq(1,x$nSlices()) ) {
slice = x$getSlice(i);
thesum = thesum + colSums(slice, na.rm)
if( na.rm ) {
thecounts = thecounts - colSums(is.na(slice))
}
}
return(thesum/thecounts);
})
.listBuilder <- setRefClass(".listBuilder",
fields = list(
dataEnv = "environment",
n = "integer"
),
methods = list(
initialize = function() {
dataEnv <<- new.env(hash = TRUE);
n <<- 0L;
# cumlength <<- 0;
return(.self);
},
add = function(x) {
if(length(x) > 0) {
n <<- n + 1L;
# cumlength <<- cumlength + length(x);
assign(paste(n), x, dataEnv );
}
return(.self);
},
set = function(i,x) {
i = as.integer(i);
if(length(x) > 0) {
if(i>n)
n <<- i;
assign(paste(i), x, dataEnv );
}
return(.self);
},
get = function(i) {
return(base::get(paste(i),dataEnv));
},
list = function() {
if(n==0) return(list());
result = vector("list",n);
for( i in 1:n) {
result[[i]] = .self$get(i);
}
return(result);
},
unlist = function() {
return(base::unlist(.self$list(), recursive=FALSE, use.names = FALSE));
},
show = function() {
#cat(".listBuilder object.\nIternal object in MatrixEQTL package.\n");
#cat("Number of elements:", .self$n, "\n");
}
))
.histogrammer <- setRefClass(".histogrammer",
fields = list(
pvbins1 = "numeric",
statbins1 = "numeric",
hist.count = "numeric"
),
methods = list(
initialize = function (pvbins, statbins) {
if(length(pvbins)) {
ord = order(statbins);
pvbins1 <<- pvbins[ord];
statbins1 <<- statbins[ord];
statbins1[length(statbins1)] <<- .Machine$double.xmax;
hist.count <<- double(length(pvbins)-1);
}
return(.self);
},
update = function(stats.for.hist) {
h = hist(stats.for.hist, breaks = statbins1, include.lowest = TRUE, right = TRUE, plot = FALSE)$counts;
hist.count <<- hist.count + h;
},
getResults = function() {
if(!is.unsorted(pvbins1)) {
return(list(hist.bins = pvbins1 , hist.counts = hist.count ));
} else {
return(list(hist.bins = rev(pvbins1), hist.counts = rev(hist.count)));
}
}
))
.minpvalue <- setRefClass(".minpvalue",
fields = list(
sdata = ".listBuilder",
gdata = ".listBuilder"
),
methods = list(
initialize = function(snps, gene) {
sdata <<- .listBuilder$new();
for( ss in 1:snps$nSlices() ) {
sdata$set( ss, double(snps$GetNRowsInSlice(ss)));
}
gdata <<- .listBuilder$new();
for( gg in 1:gene$nSlices() ) {
gdata$set( gg, double(gene$GetNRowsInSlice(gg)));
}
return(.self);
},
update = function(ss, gg, astat) {
gmax = gdata$get(gg)
z1 = max.col(astat,ties.method="first");
z11 = astat[1:nrow(astat) + nrow(astat) * (z1 - 1)];
gmax = pmax(gmax, z11);
gdata$set(gg, gmax);
smax = sdata$get(ss)
z22 = apply(astat,2,max);
smax = pmax(smax, z22);
sdata$set(ss, smax);
return(.self);
},
updatecis = function(ss, gg, select.cis, astat) {
if(length(astat)>0)
{
byrows = aggregate(x=astat, by=list(row=select.cis[,1]), FUN=max);
bycols = aggregate(x=astat, by=list(col=select.cis[,2]), FUN=max);
gmax = gdata$get(gg);
gmax[byrows$row] = pmax(gmax[byrows$row], byrows$x)
gdata$set(gg, gmax);
smax = sdata$get(ss)
smax[bycols$col] = pmax(smax[bycols$col], bycols$x)
sdata$set(ss, smax);
}
return(.self);
},
getResults = function(snps, gene, pvfun) {
min.pv.snps = pvfun(sdata$unlist());
names(min.pv.snps) = rownames(snps);
min.pv.gene = pvfun(gdata$unlist());
names(min.pv.gene) = rownames(gene);
return(list(min.pv.snps = min.pv.snps, min.pv.gene = min.pv.gene));
}
))
.OutputSaver_FRD <- setRefClass(".OutputSaver_FRD",
fields = list(
sdata = ".listBuilder",
gdata = ".listBuilder",
cdata = ".listBuilder",
bdata = ".listBuilder",
fid = "list",
testfun1 = "list",
pvfun1 = "list"
),
methods = list(
initialize = function () {
sdata <<- .listBuilder$new();
gdata <<- .listBuilder$new();
cdata <<- .listBuilder$new();
bdata <<- .listBuilder$new();
fid <<- list(0);
testfun1 <<- list(0);
pvfun1 <<- list(0);
return(.self);
},
start = function(filename, statistic_name, unused1, unused2, testfun, pvfun) {
testfun1 <<- list(testfun);
pvfun1 <<- list(pvfun);
if(length(filename) > 0) {
if(class(filename) == "character") {
fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE);
} else {
fid <<- list(filename, FALSE)
}
writeLines( paste("SNP\tgene\t",statistic_name,"\tp-value\tFDR", sep = ""), fid[[1]]);
} else {
fid <<- list();
}
},
update = function(spos, gpos, sta, beta = NULL) {
if(length(sta)>0) {
sdata$add(spos);
gdata$add(gpos);
cdata$add(sta );
if(!is.null(beta ))
bdata$add(beta );
}
return(.self);
},
getResults = function( gene, snps, FDR_total_count) {
pvalues = NULL;
if(cdata$n > 0) {
tests = testfun1[[1]](cdata$unlist());
cdata <<- .listBuilder$new();
pvalues = pvfun1[[1]](tests);
ord = order(pvalues);
tests = tests[ord];
pvalues = pvalues[ord];
FDR = pvalues * FDR_total_count / (1:length(pvalues));
FDR[length(FDR)] = min(FDR[length(FDR)], 1);
FDR = rev(cummin(rev(FDR)));
snps_names = rownames(snps)[sdata$unlist()[ord]];
sdata <<- .listBuilder$new();
gene_names = rownames(gene)[gdata$unlist()[ord]];
gdata <<- .listBuilder$new();
beta = NULL;
if(bdata$n > 0)
beta = bdata$unlist()[ord];
if(length(fid)>0) {
step = 1000; ########### 100000
for( part in 1:ceiling(length(FDR)/step) ) {
fr = (part-1)*step + 1;
to = min(part*step, length(FDR));
dump = data.frame(snps_names[fr:to],
gene_names[fr:to],
if(is.null(beta)) tests[fr:to] else list(beta[fr:to],tests[fr:to]),
pvalues[fr:to],
FDR[fr:to],
row.names = NULL,
check.rows = FALSE,
check.names = FALSE,
stringsAsFactors = FALSE);
write.table(dump, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE);
}
}
} else {
cat("No significant associations were found.\n", file = if(length(fid)>0){fid[[1]]}else{""});
}
if(length(fid)>0) {
if(fid[[2]]) {
close(fid[[1]]);
}
fid <<- list();
}
if(!is.null(pvalues)) {
eqtls = list( snps = snps_names,
gene = gene_names,
statistic = tests,
pvalue = pvalues,
FDR = FDR);
if(!is.null(beta))
eqtls$beta = beta;
} else {
eqtls = list( snps = character(),
gene = character(),
beta = numeric(),
statistic = numeric(),
pvalue = numeric(),
FDR = numeric());
}
return(list(eqtls = data.frame(eqtls)));
}
)
)
.OutputSaver_direct <- setRefClass(".OutputSaver_direct",
fields = list(
gene_names = "character",
snps_names = "character",
fid = "list",
testfun1 = "list",
pvfun1 = "list"
),
methods = list(
initialize = function() {
gene_names <<- character(0);
snps_names <<- character(0);
fid <<- list(0);
testfun1 <<- list(0);
pvfun1 <<- list(0);
return(.self);
},
start = function(filename, statistic_name, snps, gene, testfun, pvfun) {
# I hope the program stops if it fails to open the file
if(class(filename) == "character") {
fid <<- list(file(description = filename, open = "wt", blocking = FALSE, raw = FALSE), TRUE);
} else {
fid <<- list(filename, FALSE)
}
writeLines(paste("SNP\tgene\t", statistic_name, "\tp-value", sep = ""), fid[[1]]);
gene_names <<- rownames(gene);
snps_names <<- rownames(snps);
testfun1 <<- list(testfun);
pvfun1 <<- list(pvfun);
},
update = function(spos, gpos, sta, beta = NULL) {
if( length(sta) == 0 )
return();
sta = testfun1[[1]](sta);
lst = list(snps = snps_names[spos], gene = gene_names[gpos], beta = beta, statistic = sta, pvalue = pvfun1[[1]](sta));
lst$beta = lst$beta;
dump2 = data.frame(lst, row.names = NULL, check.rows = FALSE, check.names = FALSE, stringsAsFactors = FALSE);
write.table(dump2, file = fid[[1]], quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE);
},
getResults = function(...) {
if(length(fid)>0) {
if(fid[[2]]) {
close(fid[[1]]);
}
fid <<- list();
}
gene_names <<- character(0);
snps_names <<- character(0);
return(list());
}
)
)
.my.pmin = function(x, val) {
# minimum "pmin" function that can handle empty array
if(length(x) == 0) {
return(x)
} else {
return(pmin.int(x,val));
}
}
.my.pmax = function(x, val) {
# minimum "pmin" function that can handle empty array
if(length(x) == 0) {
return(x)
} else {
return(pmax.int(x,val));
}
}
.pv.nz = function(x){return( .my.pmax(x,.Machine$double.xmin) )}
.SetNanRowMean = function(x) {
if( any(is.na(x)) ) {
rowmean = rowMeans(x, na.rm = TRUE);
rowmean[ is.na(rowmean) ] = 0;
for( j in which(!complete.cases(x)) ) {
where1 = is.na( x[j, ] );
x[j,where1] = rowmean[j];
}
}
return(x);
}
.SNP_process_split_for_ANOVA = function(x,n.groups) {
# split into 2 dummy variables (or more)
# # Get the number of ANOVA groups
# n.groups = options("MatrixEQTL.ANOVA.categories")[[1]];
# if( is.null(n.groups))
# n.groups = 3;
# Unique values in x (make sure it has length of n.groups);
uniq = unique(as.vector(x));
uniq = uniq[!is.na(uniq)];
if( length(uniq) > n.groups ) {
stop("More than declared number of genotype categories is detected by ANOVA");
} else if ( length(uniq) < n.groups ) {
uniq = c(uniq, rep(min(uniq)-1, n.groups-length(uniq)));
}
# Table of frequencies for each variable (row)
freq = matrix(0, nrow(x), n.groups);
for(i in 1:n.groups) {
freq[ ,i] = rowSums(x==uniq[i], na.rm = TRUE);
}
# remove NA-s from x for convenience
x[is.na(x)] = min(uniq)-2;
# Output list of matrices
rez = vector("list",n.groups-1);
# Skip the most frequent value
md = apply(freq, 1, which.max); # most frequent value for each variable
freq[ cbind(1:nrow(x),md) ] = -1;
# The rest form dumm
for(j in 1:(n.groups-1)){
md = apply(freq, 1, which.max);
freq[ cbind(1:nrow(x),md) ] = -1;
rez[[j]] = (x == uniq[md]);
}
return( rez );
}
Matrix_eQTL_engine2 = function(
snps,
gene,
cvrt = SlicedData$new(),
output_file_name = "",
pvOutputThreshold = 1e-5,
useModel = modelLINEAR,
errorCovariance = numeric(),
verbose = TRUE,
output_file_name.cis = "",
pvOutputThreshold.cis = 0,
snpspos = NULL,
genepos = NULL,
cisDist = 1e6,
pvalue.hist = FALSE,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE,
addInfo = "") {
################################# Basic variable checks #################################
{
# status("Performing basic checks of the input variables");
stopifnot( "SlicedData" %in% class(gene) );
stopifnot( any(c("SlicedData","SlicedData.fmt") %in% class(snps)) );
stopifnot( "SlicedData" %in% class(cvrt) );
# Check dimensions
if( min(snps$nRows(),snps$nCols()) == 0 )
stop("Empty genotype dataset");
if( min(gene$nRows(),gene$nCols()) == 0 )
stop("Empty expression dataset");
if( snps$nCols() != gene$nCols() )
stop("Different number of samples in the genotype and gene expression files");
if( cvrt$nRows()>0 ) {
if( snps$nCols() != cvrt$nCols() )
stop("Wrong number of samples in the matrix of covariates");
}
stopifnot( class(pvOutputThreshold) == "numeric" );
stopifnot( length(pvOutputThreshold) == 1 );
stopifnot( pvOutputThreshold >= 0 );
stopifnot( pvOutputThreshold <= 1 );
stopifnot( class(noFDRsaveMemory) == "logical" );
stopifnot( length(noFDRsaveMemory) == 1 );
if( pvOutputThreshold > 0 ) {
stopifnot( !((length(output_file_name) == 0) && noFDRsaveMemory) )
stopifnot( length(output_file_name) <= 1 );
if( length(output_file_name) == 1 ) {
stopifnot( class(output_file_name) %in% c("character","connection") );
}
}
stopifnot( class(pvOutputThreshold.cis) == "numeric" );
stopifnot( length(pvOutputThreshold.cis) == 1 );
stopifnot( pvOutputThreshold.cis >= 0 );
stopifnot( pvOutputThreshold.cis <= 1 );
stopifnot( !((pvOutputThreshold > 0) & (pvOutputThreshold.cis > 0) & (pvOutputThreshold > pvOutputThreshold.cis)) );
stopifnot( (pvOutputThreshold > 0) | (pvOutputThreshold.cis > 0) );
stopifnot( class(useModel) == class(modelLINEAR) );
stopifnot( length(useModel) == 1 );
stopifnot( useModel %in% c(modelLINEAR, modelANOVA, modelLINEAR_CROSS) );
if( useModel %in% c(modelLINEAR, modelLINEAR_CROSS) ) {
if( snps$nCols() <= cvrt$nRows() + 1 + 1) {
stop("The number of covariates exceeds the number of samples.\nLinear regression can not be fit.")
}
}
if( useModel == modelLINEAR_CROSS ) {
if( cvrt$nRows() == 0 ) {
stop( "Model \"modelLINEAR_CROSS\" requires at least one covariate" );
}
}
if( useModel == modelANOVA ) {
n.anova.groups = getOption("MatrixEQTL.ANOVA.categories", 3);
stopifnot( n.anova.groups == floor(n.anova.groups) );
stopifnot( n.anova.groups >= 3 );
# stopifnot( n.anova.groups < snps$nCols() - cvrt$nRows() - 2 );
if( snps$nCols() <= cvrt$nRows() + n.anova.groups) {
stop("The number of covariates exceeds the number of samples.\nLinear regression (ANOVA) can not be fit.")
}
}
stopifnot( class(verbose) == "logical" );
stopifnot( length(verbose) == 1 );
stopifnot( class(min.pv.by.genesnp) == "logical" );
stopifnot( length(min.pv.by.genesnp) == 1 );
if( pvOutputThreshold.cis > 0 ) {
stopifnot( !((length(output_file_name.cis) == 0) && noFDRsaveMemory) )
stopifnot( length(output_file_name.cis) <= 1 );
if( length(output_file_name.cis) == 1 ) {
stopifnot( class(output_file_name.cis) %in% c("character","connection") );
}
# stopifnot( class(output_file_name.cis) == "character" );
# stopifnot( length(output_file_name.cis) == 1 );
stopifnot( class(snpspos) == "data.frame" );
stopifnot( ncol(snpspos) == 3 );
stopifnot( nrow(snpspos) > 0 );
stopifnot( class(snpspos[1,3]) %in% c("integer", "numeric") )
stopifnot( !any(is.na(snpspos[,3])) )
stopifnot( class(genepos) == "data.frame" );
stopifnot( ncol(genepos) == 4 );
stopifnot( nrow(genepos) > 0 );
stopifnot( class(genepos[1,3]) %in% c("integer", "numeric") )
stopifnot( class(genepos[1,4]) %in% c("integer", "numeric") )
stopifnot( !any(is.na(genepos[,3])) )
stopifnot( !any(is.na(genepos[,4])) )
stopifnot( nzchar(output_file_name.cis) )
}
if( pvOutputThreshold > 0 ) {
stopifnot( nzchar(output_file_name) )
}
stopifnot( class(errorCovariance) %in% c("numeric", "matrix") );
errorCovariance = as.matrix(errorCovariance);
if(length(errorCovariance)>0) {
if( nrow(errorCovariance) != ncol(errorCovariance) ) {
stop("The covariance matrix is not square");
}
if( nrow(errorCovariance) != snps$nCols() ) {
stop("The covariance matrix size does not match the number of samples");
}
if( !all(errorCovariance == t(errorCovariance)) ) {
stop("The covariance matrix is not symmetric");
}
}
}
################################# Initial setup #########################################
{
gene.std = .listBuilder$new();
snps.std = .listBuilder$new();
dont.clone.gene = getOption("MatrixEQTL.dont.preserve.gene.object", FALSE)
if(is.null(dont.clone.gene))
dont.clone.gene = FALSE;
if( !dont.clone.gene )
gene = gene$Clone();
# snps = snps$Clone(); # snps is read only
cvrt = cvrt$Clone();
params = list(
output_file_name = output_file_name,
pvOutputThreshold = pvOutputThreshold,
useModel = useModel,
errorCovariance = errorCovariance ,
verbose = verbose,
output_file_name.cis = output_file_name.cis,
pvOutputThreshold.cis = pvOutputThreshold.cis,
cisDist = cisDist ,
pvalue.hist = pvalue.hist,
min.pv.by.genesnp = min.pv.by.genesnp);
if( verbose ) {
lastTime = 0;
status <- function(text) {
# gc();
newTime = proc.time()[3];
if(lastTime != 0) {
cat("Task finished in ", newTime-lastTime, " seconds\n");
}
#cat(text,"\n");
lastTime <<- newTime;
unused = flush.console();
}
} else {
status = function(text){}
}
start.time = proc.time()[3];
}
################################# Error covariance matrix processing ####################
{
if( length(errorCovariance) > 0 ) {
status("Processing the error covariance matrix");
eig = eigen(errorCovariance, symmetric = TRUE)
d = eig$values;
v = eig$vectors;
# errorCovariance == v %*% diag(d) %*% t(v)
# errorCovariance^0.5 == v*sqrt(d)*v" (no single quotes anymore)
# errorCovariance^(-0.5) == v*diag(1./sqrt(diag(d)))*v"
if( any(d<=0) ) {
stop("The covariance matrix is not positive definite");
}
correctionMatrix = v %*% diag(1./sqrt(d)) %*% t(v);
rm( eig, v, d, errorCovariance )
} else {
rm( errorCovariance );
correctionMatrix = numeric();
}
}
################################# Matching gene and SNPs locations ######################
if( pvOutputThreshold.cis > 0 ) {
status("Matching data files and location files")
# names in the input data
gene_names = rownames(gene);
snps_names = rownames(snps);
# gene range, set: left<right
if(any(genepos[,3] > genepos[,4])) {
temp3 = genepos[,3];
temp4 = genepos[,4];
genepos[,3] = pmin(temp3,temp4);
genepos[,4] = pmax(temp3,temp4);
rm(temp3, temp4);
}
# match with the location data
genematch = match( gene_names, genepos[ ,1], nomatch = 0L);
usedgene = matrix(FALSE, nrow(genepos), 1); # genes in "genepos" that are matching "gene_names"
usedgene[ genematch ] = TRUE;
if( !any(genematch) ) {
stop("Gene names do not match those in the gene location file.");
}
cat( sum(genematch>0), "of", length(gene_names), " genes matched\n");
snpsmatch = match( snps_names, snpspos[ ,1], nomatch = 0L);
usedsnps = matrix(FALSE, nrow(snpspos),1);
usedsnps[ snpsmatch ] = TRUE;
if( !any(snpsmatch) ) {
stop("SNP names do not match those in the SNP location file.");
}
cat( sum(snpsmatch>0), "of", length(snps_names), " SNPs matched\n");
# list used chr names
chrNames = unique(c( as.character(unique(snpspos[usedsnps,2])),
as.character(unique(genepos[usedgene,2])) ))
chrNames = chrNames[ sort.list( suppressWarnings(as.integer(chrNames)),
method = "radix", na.last = TRUE ) ];
# match chr names
genechr = match(genepos[,2],chrNames);
snpschr = match(snpspos[,2],chrNames);
# max length of a chromosome
chrMax = max( snpspos[usedsnps, 3], genepos[usedgene, 4], na.rm = TRUE) + cisDist;
# Single number location for all rows in "genepos" and "snpspos"
genepos2 = as.matrix(genepos[ ,3:4, drop = FALSE] + (genechr-1)*chrMax);
snpspos2 = as.matrix(snpspos[ ,3 , drop = FALSE] + (snpschr-1)*chrMax);
# the final location arrays;
snps_pos = matrix(0,length(snps_names),1);
snps_pos[snpsmatch>0, ] = snpspos2[snpsmatch, , drop = FALSE];
snps_pos[rowSums(is.na(snps_pos))>0, ] = 0;
snps_pos[snps_pos==0] = (length(chrNames)+1) * (chrMax+cisDist);
rm(snps_names, snpsmatch, usedsnps, snpschr, snpspos2)
gene_pos = matrix(0,length(gene_names),2);
gene_pos[genematch>0, ] = genepos2[genematch, , drop = FALSE];
gene_pos[rowSums(is.na(gene_pos))>0, ] = 0;
gene_pos[gene_pos==0] = (length(chrNames)+2) * (chrMax+cisDist);
rm(gene_names, genematch, usedgene, genechr, genepos2)
rm(chrNames, chrMax);
if( is.unsorted(snps_pos) ) {
status("Reordering SNPs\n");
ordr = sort.list(snps_pos);
snps$RowReorder(ordr);
snps_pos = snps_pos[ordr, , drop = FALSE];
rm(ordr);
}
if( is.unsorted(rowSums(gene_pos)) ) {
status("Reordering genes\n");
ordr = sort.list(rowSums(gene_pos));
gene$RowReorder(ordr);
gene_pos = gene_pos[ordr, , drop = FALSE];
rm(ordr);
}
# Slice it back.
geneloc = vector("list", gene$nSlices())
gene_offset = 0;
for(gc in 1:gene$nSlices()) {
nr = gene$GetNRowsInSlice(gc);
geneloc[[gc]] = gene_pos[gene_offset + (1:nr), , drop = FALSE];
gene_offset = gene_offset + nr;
}
rm(gc, gene_offset, gene_pos);
snpsloc = vector("list", snps$nSlices())
snps_offset = 0;
for(sc in 1:snps$nSlices()) {
nr = snps$GetNRowsInSlice(sc);
snpsloc[[sc]] = snps_pos[snps_offset + (1:nr), , drop = FALSE];
snps_offset = snps_offset + nr;
}
rm(nr, sc, snps_offset, snps_pos);
}
################################# Covariates processing #################################
{
status("Processing covariates");
if( useModel == modelLINEAR_CROSS ) {
last.covariate = as.vector(tail( cvrt$getSlice(cvrt$nSlices()), n = 1));
}
if( cvrt$nRows()>0 ) {
cvrt$SetNanRowMean();
cvrt$CombineInOneSlice();
cvrt = rbind(matrix(1,1,snps$nCols()),cvrt$getSlice(1));
} else {
cvrt = matrix(1,1,snps$nCols());
}
# Correct for the error covariance structure
if( length(correctionMatrix)>0 ) {
cvrt = cvrt %*% correctionMatrix;
}
# Orthonormalize covariates
# status("Orthonormalizing covariates");
q = qr(t(cvrt));
if( min(abs(diag(qr.R(q)))) < .Machine$double.eps * snps$nCols() ) {
stop("Colinear or zero covariates detected");
}
cvrt = t( qr.Q(q) );
rm( q );
}
################################# Gene expression processing ############################
{
status("Processing gene expression data (imputation, residualization, etc.)");
# Impute gene expression
gene$SetNanRowMean();
# Correct for the error covariance structure
if( length(correctionMatrix)>0 ) {
gene$RowMatrixMultiply(correctionMatrix);
}
# Orthogonolize expression w.r.t. covariates
# status("Orthogonolizing expression w.r.t. covariates");
gene_offsets = double(gene$nSlices()+1);
for( sl in 1:gene$nSlices() ) {
slice = gene$getSlice(sl);
gene_offsets[sl+1] = gene_offsets[sl] + nrow(slice);
rowsq1 = rowSums(slice^2);
slice = slice - tcrossprod(slice,cvrt) %*% cvrt;
rowsq2 = rowSums(slice^2);
# kill rows colinear with the covariates
delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps );
slice[delete.rows,] = 0;
rowsq2[delete.rows] = 1;
div = sqrt(rowsq2); #sqrt( rowSums(slice^2) );
# div[ div == 0 ] = 1;
gene.std$set(sl, div);
gene$setSlice(sl, slice / div);
}
rm(rowsq1, rowsq2, delete.rows, div);
rm( sl, slice );
#gene$RowRemoveZeroEps();
}
################################# snps_process, testfun, pvfun, threshfun, afun ########
{
# snps_process - preprocess SNPs slice
#
# afun --- abs for signed stats, identity for non-negative
# threshfun --- internal stat threshold for given p-value
# testfun --- t or F statistic from the internal one
# pvfun --- p-value from the t or F statistic
nSamples = snps$nCols();
nGenes = gene$nRows();
nSnps = snps$nRows();
nCov = nrow(cvrt);
# nVarTested = length(snps_list); # set in case(useModel)
# dfNull = nSamples - nCov;
# d.f. of the full model
betafun = NULL;
if( useModel == modelLINEAR ) {
snps_process = function(x) {
return( list(.SetNanRowMean(x)) );
};
nVarTested = 1;
dfFull = nSamples - nCov - nVarTested;
statistic.fun = function(mat_list) {
return( mat_list[[1]] );
}
afun = function(x) {return(abs(x))};
threshfun = function(pv) {
thr = qt(pv/2, dfFull, lower.tail = FALSE);
thr = thr^2;
thr = sqrt( thr / (dfFull + thr) );
thr[pv >= 1] = 0;
thr[pv <= 0] = 1;
return( thr );
}
testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); }
pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2)); }
thresh.cis = threshfun(pvOutputThreshold.cis);
thresh = threshfun(pvOutputThreshold);
betafun = function(stat, ss, gg, select) {
return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]);
}
} else
if( useModel == modelANOVA ) {
snps_process = function(x).SNP_process_split_for_ANOVA(x,n.anova.groups);
nVarTested = n.anova.groups - 1;
dfFull = nSamples - nCov - nVarTested;
# statistic.fun = function(mat_list) {
# return( mat_list[[1]]^2 + mat_list[[2]]^2 );
# }
statistic.fun = function(mat_list) {
x = mat_list[[1]]^2;
for( j in 2:length(mat_list) )
x = x + mat_list[[j]]^2;
return( x );
}
afun = identity;
threshfun = function(pv) {
thr = qf(pv, nVarTested, dfFull, lower.tail = FALSE);
thr = thr / (dfFull/nVarTested + thr);
thr[pv >= 1] = 0;
thr[pv <= 0] = 1;
return( thr );
}
testfun = function(x) { return( x / (1 - .my.pmin(x,1)) * (dfFull/nVarTested) ); }
pvfun = function(x) { return( .pv.nz(pf(x, nVarTested, dfFull, lower.tail = FALSE)) ); }
thresh.cis = threshfun(pvOutputThreshold.cis);
thresh = threshfun(pvOutputThreshold);
} else
if( useModel == modelLINEAR_CROSS ) {
last.covariate = as.vector( last.covariate );
snps_process = .SNP_process_split_for_LINEAR_CROSS = function(x) {
out = vector("list", 2);
out[[1]] = .SetNanRowMean(x);
out[[2]] = t( t(out[[1]]) * last.covariate );
return( out );
};
nVarTested = 1;
dfFull = nSamples - nCov - nVarTested - 1;
statistic.fun = function(mat_list) {
return( mat_list[[2]] / sqrt(1 - mat_list[[1]]^2) );
}
afun = function(x) {return(abs(x))};
threshfun = function(pv) {
thr = qt(pv/2, dfFull, lower.tail = FALSE);
thr = thr^2;
thr = sqrt( thr / (dfFull + thr) );
thr[pv >= 1] = 0;
thr[pv <= 0] = 1;
return( thr );
}
testfun = function(x) { return( x * sqrt( dfFull / (1 - .my.pmin(x^2,1)))); }
pvfun = function(x) { return( .pv.nz(pt(-abs(x),dfFull)*2 )); }
thresh.cis = threshfun(pvOutputThreshold.cis);
thresh = threshfun(pvOutputThreshold);
betafun = function(stat, ss, gg, select) {
return(stat * gene.std$get(gg)[select[,1]] / snps.std$get(ss)[select[,2]]);
}
}
params$dfFull = dfFull;
}
################################# Saver class(es) creation ##############################
{
status("Creating output file(s)");
if(noFDRsaveMemory) {
if( pvOutputThreshold > 0 ) {
saver.tra = .OutputSaver_direct$new();
}
if( pvOutputThreshold.cis > 0 ) {
saver.cis = .OutputSaver_direct$new();
}
} else {
if( pvOutputThreshold > 0 ) {
saver.tra = .OutputSaver_FRD$new();
}
if( pvOutputThreshold.cis > 0 ) {
saver.cis = .OutputSaver_FRD$new();
}
}
if( pvOutputThreshold > 0 )
if( pvOutputThreshold * gene$nRows() * snps$nRows() > 1000000 )
if(!noFDRsaveMemory)
cat("Warning: pvOutputThreshold may be too large.\nExpected number of findings > ",
pvOutputThreshold * gene$nRows() * snps$nRows(),"\n");
if( (useModel == modelLINEAR) || (useModel == modelLINEAR_CROSS) ) {
statistic_name = "t-stat";
} else if( useModel == modelANOVA ) {
statistic_name = "F-test";
}
if(!is.null(betafun))
statistic_name = paste("beta\t",statistic_name, sep="");
if( pvOutputThreshold > 0 )
saver.tra$start(output_file_name, statistic_name, snps, gene, testfun, pvfun);
if( pvOutputThreshold.cis > 0 )
saver.cis$start(output_file_name.cis, statistic_name, snps, gene, testfun, pvfun);
rm( statistic_name );
}
################################# Some useful functions #################################
{
orthonormalize.snps = function(cursnps, ss) {
for(p in 1:length(cursnps)) {
if(length(correctionMatrix)>0) {
cursnps[[p]] = cursnps[[p]] %*% correctionMatrix;
}
rowsq1 = rowSums(cursnps[[p]]^2);
cursnps[[p]] = cursnps[[p]] - tcrossprod(cursnps[[p]],cvrt) %*% cvrt;
for(w in .seq(1L,p-1L))
cursnps[[p]] = cursnps[[p]] - rowSums(cursnps[[p]]*cursnps[[w]]) * cursnps[[w]];
rowsq2 = rowSums(cursnps[[p]]^2);
delete.rows = (rowsq2 <= rowsq1 * .Machine$double.eps );
cursnps[[p]][delete.rows,] = 0;
div = sqrt( rowsq2 );
div[ delete.rows ] = 1;
# show(c(rowsq2,rowsq1, div));
cursnps[[p]] = cursnps[[p]]/div;
}
snps.std$set(ss, div);
return(cursnps);
}
# if( pvOutputThreshold.cis > 0 ) {
# is.cis.pair = function(gg,ss) {
# return(!( ( snpsloc[[ss]][1, 1] - tail( geneloc[[gg]][ , 2], n = 1L) > cisDist) |
# ( geneloc[[gg]][1, 1] - tail( snpsloc[[ss]] , n = 1L) > cisDist) ) );
# }
# }
if( pvOutputThreshold.cis > 0 ) {
# sn.l = sapply(snpsloc, function(x)x[1] );
# sn.r = sapply(snpsloc, function(x)tail(x,1) );
# ge.l = sapply(geneloc, function(x)x[1,1] );
# ge.r = sapply(geneloc, function(x)x[nrow(x) , 2] );
sn.l = sapply(snpsloc, "[", 1 );
sn.r = sapply(snpsloc, tail, 1 );
ge.l = sapply(geneloc, "[", 1, 1 );
ge.r = sapply( lapply(geneloc, tail.matrix, 1 ), "[", 2);
gg.1 = findInterval( sn.l , ge.r + cisDist +1) + 1;
# cat(gg.1,"\n")
gg.2 = findInterval( sn.r , ge.l - cisDist );
# cat(gg.2,"\n")
rm(sn.l, sn.r, ge.l, ge.r);
}
}
################################# Prepare counters and histogram bins ###################
{
pvbins = NULL; # bin edges for p-values
statbins = 0; # bin edges for the test statistic (|t| or F)
do.hist = FALSE;
if( length(pvalue.hist) == 1 ) {
if(pvalue.hist == "qqplot") {
pvbins = c(0, 10^rev(seq(0, log10(.Machine$double.xmin)-1, -0.05)));
} else
if( is.numeric(pvalue.hist) ) {
pvbins = seq(from = 0, to = 1, length.out = pvalue.hist+1);
} else
if( pvalue.hist == TRUE ) {
pvbins = seq(from = 0, to = 1, length.out = 100+1);
}
} else
if( is.numeric(pvalue.hist) && (length(pvalue.hist) > 1) ) {
pvbins = pvalue.hist;
}
if( is.null(pvbins) && (pvalue.hist != FALSE) ) {
stop("Wrong value of pvalue.hist. Must be FALSE, TRUE, \"qqplot\", or numerical");
}
do.hist = !is.null(pvbins);
if( do.hist ) {
pvbins = sort(pvbins);
statbins = threshfun(pvbins);
if( pvOutputThreshold > 0) {
hist.all = .histogrammer$new(pvbins, statbins);
}
if( pvOutputThreshold.cis > 0) {
hist.cis = .histogrammer$new(pvbins, statbins);
}
}
rm( pvbins, statbins);
if(min.pv.by.genesnp) {
if( pvOutputThreshold > 0) {
minpv.tra = .minpvalue$new(snps,gene);
}
if( pvOutputThreshold.cis > 0) {
minpv.cis = .minpvalue$new(snps,gene);
}
}
}
################################# Main loop #############################################
{
beta = NULL;
n.tests.all = 0;
n.tests.cis = 0;
n.eqtls.tra = 0;
n.eqtls.cis = 0;
status("Performing eQTL analysis");
# ss = 1; gg = 1;
# ss = snps$nSlices(); gg = gene$nSlices();
snps_offset = 0;
for(ss in 1:snps$nSlices()) {
# for(ss in 1:min(2,snps$nSlices())) { #for debug
cursnps = NULL;
nrcs = snps$GetNRowsInSlice(ss);
# loop only through the useful stuff
for(gg in if(pvOutputThreshold>0){1:gene$nSlices()}else{.seq(gg.1[ss],gg.2[ss])} ) {
gene_offset = gene_offsets[gg];
curgene = gene$getSlice(gg);
nrcg = nrow(curgene);
if(nrcg == 0) next;
rp = "";
statistic = NULL;
select.cis.raw = NULL;
## do cis analysis
# if( (pvOutputThreshold.cis > 0) && ( is.cis.pair(gg, ss) ) ) {
if( (pvOutputThreshold.cis > 0) && (gg >= gg.1[ss]) && (gg <= gg.2[ss]) ) {
if( is.null( statistic ) ) {
if( is.null( cursnps ) ) {
cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss );
}
mat = vector("list", length(cursnps));
for(d in 1:length(cursnps)) {
mat[[d]] = tcrossprod(curgene, cursnps[[d]]);
}
statistic = statistic.fun( mat );
astatistic = afun(statistic);
# rm(mat);
}
# sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1 +1 , snpsloc[[ss]]);
# sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist -1 , snpsloc[[ss]]);
sn.l = findInterval(geneloc[[gg]][ ,1] - cisDist-1, snpsloc[[ss]]);
sn.r = findInterval(geneloc[[gg]][ ,2] + cisDist, snpsloc[[ss]]);
xx = unlist(lapply(which(sn.r>sn.l),FUN=function(x){(sn.l[x]:(sn.r[x]-1))*nrow(statistic)+x}))
select.cis.raw = xx[ astatistic[xx] >= thresh.cis ];
select.cis = arrayInd(select.cis.raw, dim(statistic))
n.tests.cis = n.tests.cis + length(xx);
n.eqtls.cis = n.eqtls.cis + length(select.cis.raw);
if( do.hist )
hist.cis$update(astatistic[xx]);
if( min.pv.by.genesnp ) {
# minpv.cis$updatecis(ss, gg, arrayInd(xx, dim(statistic)), astatistic[xx])
temp = double(length(astatistic));
dim(temp) = dim(astatistic);
temp[xx] = astatistic[xx];
minpv.cis$update(ss, gg, temp);
}
if(!is.null(betafun))
beta = betafun(mat[[length(mat)]][select.cis.raw], ss, gg, select.cis);
saver.cis$update( snps_offset + select.cis[ , 2],
gene_offset + select.cis[ , 1],
statistic[select.cis.raw],
beta);
# statistic.select.cis = statistic[ select.cis ];
# test = testfun( statistic.select.cis );
# pv = pvfun(test);
# Saver.cis$WriteBlock( cbind(snps_offset + select.cis[ , 2], gene_offset + select.cis[ , 1], test, pv) );
# counter.cis$Update(gg, ss, select.cis, pv, n.tests = length(xx), if(do.hist) afun(statistic[xx]) )
rp = paste(rp, ", ", formatC(n.eqtls.cis, big.mark=",", format = "f", digits = 0), " cis-eQTLs", sep = "");
}
## do trans/all analysis
if(pvOutputThreshold>0) {
if( is.null( statistic ) ) {
if( is.null( cursnps ) ) {
cursnps = orthonormalize.snps( snps_process( snps$getSlice(ss) ), ss );
}
mat = vector("list", length(cursnps));
for(d in 1:length(cursnps)) {
mat[[d]] = tcrossprod(curgene, cursnps[[d]]);
}
statistic = statistic.fun( mat );
astatistic = afun(statistic);
# rm(mat);
}
if( do.hist )
hist.all$update(astatistic);
if(!is.null(select.cis.raw))
astatistic[xx] = -1;
# select.tra.raw = select.tra.raw[!(select.tra.raw %in% select.cis.raw)];
select.tra.raw = which( astatistic >= thresh);
select.tra = arrayInd(select.tra.raw, dim(statistic))
n.eqtls.tra = n.eqtls.tra + length(select.tra.raw);
n.tests.all = n.tests.all + length(statistic);
if(!is.null(betafun))
beta = betafun(mat[[length(mat)]][select.tra.raw], ss, gg, select.tra);
saver.tra$update( snps_offset + select.tra[ , 2],
gene_offset + select.tra[ , 1],
statistic[select.tra.raw],
beta);
if( min.pv.by.genesnp )
minpv.tra$update(ss, gg, astatistic)
# statistic.select.tra = statistic[ select.tra ];
# test = testfun( statistic.select.tra );
# pv = pvfun( test );
# Saver$WriteBlock( cbind( snps_offset + select.tra[ , 2], gene_offset + select.tra[ , 1], test, pv) );
# counter$Update(gg, ss, select.tra, pv, n.tests = nrcs*nrcg, if(do.hist) afun(statistic) )
rp = paste(rp, ", ", formatC(n.eqtls.tra, big.mark=",", format = "f", digits = 0), " ", addInfo, sep = "")
}
#gene_offset = gene_offset + nrcg;
if( !is.null(statistic) ) {
per = 100*(gg/gene$nSlices() + ss-1) / snps$nSlices();
cat( formatC(floor(per*100)/100, format = "f", width = 5, digits = 2), "% done" , rp, "\n", sep = "");
flush.console();
}
} # gg in 1:gene$nSlices()
snps_offset = snps_offset + nrcs;
} # ss in 1:snps$nSlices()
}
################################# Results collection ####################################
{
rez = list(time.in.sec = proc.time()[3] - start.time);
rez$param = params;
if(pvOutputThreshold.cis > 0) {
rez.cis = list(ntests = n.tests.cis, neqtls = n.eqtls.cis);
rez.cis = c(rez.cis, saver.cis$getResults( gene, snps, n.tests.cis) );
if(do.hist)
rez.cis = c(rez.cis, hist.cis$getResults() );
if(min.pv.by.genesnp)
rez.cis = c(rez.cis, minpv.cis$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) );
}
if(pvOutputThreshold>0) {
rez.all = list(ntests = n.tests.all, neqtls = n.eqtls.tra + n.eqtls.cis);
if(pvOutputThreshold.cis > 0) {
rez.tra = list(ntests = n.tests.all - n.tests.cis, neqtls = n.eqtls.tra);
rez.tra = c(rez.tra, saver.tra$getResults( gene, snps, n.tests.all - n.tests.cis) );
} else {
rez.all = c(rez.all, saver.tra$getResults( gene, snps, n.tests.all ) );
}
if(do.hist) {
rez.all = c(rez.all, hist.all$getResults() );
if(pvOutputThreshold.cis > 0) {
rez.tra$hist.bins = rez.all$hist.bins;
rez.tra$hist.counts = rez.all$hist.counts - rez.cis$hist.counts;
}
}
if(min.pv.by.genesnp) {
if(pvOutputThreshold.cis > 0) {
rez.tra = c(rez.tra, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) );
} else {
rez.all = c(rez.all, minpv.tra$getResults(snps, gene, pvfun = function(x){pvfun(testfun(x))}) );
}
}
}
if(exists("rez.all")>0)
rez$all = rez.all;
if(exists("rez.tra")>0)
rez$trans = rez.tra;
if(exists("rez.cis")>0)
rez$cis = rez.cis;
class(rez) = c(class(rez),"MatrixEQTL");
status("");
}
# cat("s std ",snps.std$get(1),"\n");
# cat("g std ",gene.std$get(1),"\n");
################################# Results collection ####################################
return(rez);
}
.histme = function(m, name1, name2, ...) {
cnts = m$hist.counts;
bins = m$hist.bins;
ntst = m$ntests;
centers = 0.5 * (bins[-1L] + bins[-length(bins)]);
density = 0.5 / (bins[-1L] - centers) * cnts / ntst;
ntext = paste("Histogram for ", name1, formatC(ntst, big.mark=",", format = "f", digits = 0), name2, " p-values ",sep="");
r = structure(list(breaks = bins, counts = cnts, density = density,
mids = centers, equidist = FALSE), class = "histogram");
plot(r, main = ntext, ylab = "Density", xlab = "P-values", ...)
abline( h = 1, col = "blue");
return(invisible());
}
.qqme = function(m, lcol, cex, pch, ...) {
cnts = m$hist.counts;
bins = m$hist.bins;
ntst = m$ntests;
cusu = cumsum(cnts) / ntst;
ypos = bins[-1][is.finite(cusu)];
xpos = cusu[is.finite(cusu)];
lines(-log10(xpos), -log10(ypos), col = lcol, ...);
# lines(xpos, ypos, col = lcol, ...);
if(length(m$eqtls$pvalue)==0)
return();
ypvs = -log10(m$eqtls$pvalue);
xpvs = -log10(1:length(ypvs) / ntst);
if(length(ypvs) > 1000) {
# need to filter a bit, make the plotting faster
levels = as.integer( xpvs/xpvs[1] * 1e3);
keep = c(TRUE, diff(levels)!=0);
levels = as.integer( ypvs/ypvs[1] * 1e3);
keep = keep | c(TRUE, diff(levels)!=0);
ypvs = ypvs[keep];
xpvs = xpvs[keep];
rm(keep)
}
points(xpvs, ypvs, col = lcol, pch = pch, cex = cex, ...);
}
plot.MatrixEQTL = function(x, cex = 0.5, pch = 19, xlim = NULL, ylim = NULL, main = NULL, ...) {
# cat(class(main),'\n')
if( x$param$pvalue.hist == FALSE ) {
warning("Cannot plot p-value distribution: the information was not recorded.\nUse pvalue.hist!=FALSE.");
return(invisible());
}
if( x$param$pvalue.hist == "qqplot" ) {
xmin = 1/max(x$cis$ntests, x$all$ntests);
ymax = NULL;
if(!is.null(ylim)) {
ymax = ylim[2];
} else {
ymax = -log10(min(
x$cis$eqtls$pvalue[1], x$cis$hist.bins[ c(FALSE,x$cis$hist.counts>0)][1],
x$all$eqtls$pvalue[1], x$all$hist.bins[ c(FALSE,x$all$hist.counts>0)][1],
x$trans$eqtls$pvalue[1], x$trans$hist.bins[c(FALSE,x$trans$hist.counts>0)][1],
na.rm = TRUE))+0.1;
}
if(ymax == 0) {
ymax = -log10(.Machine$double.xmin)
}
if(!is.null(ymax))
ylim = c(0,ymax);
if(is.null(xlim))
xlim = c(0, -log10(xmin/1.5));
plot(numeric(),numeric(), xlab = "-Log10(p-value), theoretical",
ylab = "-Log10(p-value), observed",
xlim = c(0, -log10(xmin/1.5)),
ylim = ylim,
xaxs="i", yaxs="i", ...);
lines(c(0,1e3), c(0,1e3), col = "gray");
if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) {
.qqme( x$cis, "red", cex, pch, ...);
.qqme( x$trans, "blue", cex, pch, ...);
if(is.null(main)) {
main = paste("QQ-plot for",
formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0),
"local and",
formatC(x$trans$ntests, big.mark=",", format = "f", digits = 0),
"distant p-values");
}
lset = c(1,2,4);
} else
if(x$param$pvOutputThreshold.cis > 0) {
.qqme(x$cis, "red", cex, pch, ...);
if(is.null(main)) {
main = paste("QQ-plot for",
formatC(x$cis$ntests, big.mark=",", format = "f", digits = 0),
"local p-values");
}
lset = c(1,4);
} else {
.qqme(x$all, "blue", cex, pch, ...);
if(is.null(main)) {
main = paste("QQ-plot for all",
formatC(x$all$ntests, big.mark=",", format = "f", digits = 0),
"p-values");
}
lset = c(3,4);
}
title(main);
legend("topleft",
c("Local p-values","Distant p-values","All p-values","diagonal")[lset],
col = c("red","blue","blue","gray")[lset],
text.col = c("red","blue","blue","gray")[lset],
pch = 20, lwd = 1, pt.cex = c(1,1,1,0)[lset])
} else {
if((x$param$pvOutputThreshold > 0) && (x$param$pvOutputThreshold.cis > 0)) {
par(mfrow=c(2,1));
.histme(x$cis, "", " local", ...);
tran = list(hist.counts = x$all$hist.counts - x$cis$hist.counts,
hist.bins = x$all$hist.bins,
ntests = x$all$ntests - x$cis$ntests);
.histme(x$trans,""," distant", ...);
par(mfrow=c(1,1));
} else
if(x$param$pvOutputThreshold.cis > 0) {
.histme(x$cis, "", " local", ...);
} else {
.histme(x$all, "all ", "" , ...);
}
}
return(invisible());
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.