Nothing
# History Jul 02 2008 Initial coding
# Nov 12 2008 Add glu.mergeAlleles
# Dec 12 2008 Add glu.qcsummary
# Dec 16 2008 Add options list to glu.mergeAlleles
# Add function glu.renameAlleles
# Feb 02 2009 Update glu.qcsummary for samples
# Add function getMAF.control
# Feb 17 2009 Update getMAF.control for subsetting the phenotype data
# May 21 2009 Add glu.partition function
# Jun 10 2009 Update glu.partition for cases and controls
# Jul 07 2009 Add function glu.getCounts
# Jul 20 2009 Fix bug in glu.partition with obtaining ccLevels
# Jul 22 2009 Add error checks in glu.partition
# Aug 10 2009 Add glu.r2 function
# Oct 19 2009 Add glu.mergePhenoGeno function
# Nov 02 2009 NO CHANGE
# Dec 22 2009 Add glu.setUp function for
# Dec 28 2009 Put glu.setUp in wga_util2.R
# Add options for glu.partition
# Jan 11 2010 Update temporary files in glu.partition
# Jan 14 2010 In glu.partition, allow data to be passed in
# Sep 13 2010 Add function for obtaining r2 matrix
# Oct 18 2010 Add function for getting the snps in genes
# Oct 19 2010 Add function for retuning the number of bins using GLU
# Oct 22 2010 Add function for returning the snps in bins based on
# an R^2 threshold and MAF.
# Oct 25 2010 Change option names in glu.nBins
# Nov 12 2010 Add function to convert to a ped file for Haploview
# Nov 16 2010 Change options in ldmatrix and add function to add missing snps
# Jan 21 2011 Convert mat to a matrix in addMissingSNPs
# Sep 12 2013 Add function to find snps in LD with a set of snps
# Sep 13 2013 Add function to extract snps
# Sep 19 2013 Add GLU option and info.list to create_ped function
# Function to open a connection and transfrom data
glu.transform <- function(infile, inFormat=NULL, outFormat="ldat") {
# infile Path to file to open a connection
# No default
# inFormat GLU format of infile
# The default is NULL
# outFormat The GLU format of the transformed data
# The default is "ldat".
command <- "glu transform"
if (!is.null(inFormat)) command <- paste(command, " -f ", inFormat, sep="")
command <- paste(command, " ", infile, sep="")
if (!is.null(outFormat)) command <- paste(command, " -F ", outFormat, sep="")
fid <- pipe(command, open="r")
} # END: glu.transform
# Function for changing the alleles using GLU
glu.mergeAlleles <- function(snp1.list, snp2.list, temp.list=NULL,
op=NULL) {
# snp1.list and snp2.list can also have the field "qcsummary".
# snp1.list Base genotype data list
# snp2.list
# temp.list
###########################################################
# op List with names:
# outfile Updated genotype file for snp2.list
# include.loci
# flip2 0 or 1 for flipping alleles in snp2.list
###########################################################
# Check the options list
op <- default.list(op,
c("outfile", "flip2"), list("out.txt", 0))
outfile <- op$outfile
flip2 <- op$flip2
include.loci <- getListName(op, "include.loci")
rm(op)
temp <- gc(verbose=FALSE)
temp.list <- check.temp.list(temp.list)
snp1.list <- check.snp.list(snp1.list)
snp2.list <- check.snp.list(snp2.list)
# Check if the qc.summary files exist
tmp1 <- getListName(snp1.list, "qcsummary")
if (is.null(tmp1)) {
tmp1 <- getTempfile(temp.list$dir, prefix="snp1", ext=".txt")
temp <- glu.qcsummary(snp1.list, out.loci=tmp1, include.loci=include.loci)
}
tmp2 <- getListName(snp2.list, "qcsummary")
if (is.null(tmp2)) {
tmp2 <- getTempfile(temp.list$dir, prefix="snp2", ext=".txt")
temp <- glu.qcsummary(snp2.list, out.loci=tmp2, include.loci=include.loci)
}
# Get the snp names and alleles
vars <- c("LOCUS", "NUM_ALLELES", "ALLELES")
tlist <- list(file=tmp1, file.type=3, delimiter="\t", header=1)
s1 <- getColumns(tlist, vars, temp.list=temp.list)
tlist$file <- tmp2
s2 <- getColumns(tlist, vars, temp.list=temp.list)
names(s1$ALLELES) <- s1$LOCUS
names(s1$NUM_ALLELES) <- s1$LOCUS
names(s2$ALLELES) <- s2$LOCUS
names(s2$NUM_ALLELES) <- s2$LOCUS
snps <- intersect(s1$LOCUS, s2$LOCUS)
# Remove snps = "*" (last row of qc.summary file)
snps <- snps[snps != "*"]
if (!length(snps)) {
print("No intersecting SNPs")
return(0)
}
a1 <- s1$ALLELES[snps]
a2 <- s2$ALLELES[snps]
n1 <- as.numeric(s1$NUM_ALLELES[snps])
n2 <- as.numeric(s2$NUM_ALLELES[snps])
# Delete temporary files
if (temp.list$delete) {
file.remove(tmp1)
file.remove(tmp2)
}
rm(s1, s2, tmp1, tmp2)
temp <- gc()
# Check for more than 2 alleles
temp <- (n1 > 2) + (n2 > 2)
if (any(temp)) {
stop("NUM_ALLELES > 2 for some SNPs")
}
# Seperate the alleles
nsnp <- length(a1)
mat1 <- matrix(data="", nrow=nsnp, ncol=2)
mat2 <- matrix(data="", nrow=nsnp, ncol=2)
mat1[,1] <- substr(a1, 1, 1)
mat1[,2] <- substr(a1, 3, 3)
mat2[,1] <- substr(a2, 1, 1)
mat2[,2] <- substr(a2, 3, 3)
rm(a1, a2)
temp <- gc()
# Create a temporary file for the allele re-naming
tmp <- getTempfile(temp.list$dir, prefix="allele", ext=".txt")
# Open and initialize the file
temp <- c("SNP", "OLD_ALLELES", "NEW_ALLELES")
fid <- writeVecToFile(temp, tmp, colnames=NULL, type=3, close=0,
sep="\t")
reverse <- 2:1
# Loop over each snp
for (i in 1:nsnp) {
n2i <- n2[i]
n1i <- n1[i]
if (!n2i) next
m1 <- mat1[i, 1:n1i]
m2 <- mat2[i, 1:n2i]
# See if the alleles match
flag <- (all(m2 %in% m1)) || (all(m1 %in% m2))
# If they agree and there is no flip, then do nothing
if (flag) {
if (!flip2) next
# Alleles agree and there is a flip
if (n2i == 2) {
old <- paste(m2, collapse=",", sep="")
new <- paste(m2[reverse], collapse=",", sep="")
} else {
old <- m2[1]
if (n1i == 2) {
# Get the other allele
if (old == m1[1]) {
new <- m1[2]
} else {
new <- m1[1]
}
} else if (n1i == 1) {
new <- m1[1]
if (new == old) next
} else {
# n1[i] is 0. Do nothing
next
}
}
} else {
if (!n1i) next
# Alleles do not match
old <- paste(m2, collapse=",", sep="")
new <- paste(m1, collapse=",", sep="")
if (n1i == 1) {
new <- m1[1]
old <- m2[1]
} else if (n2i == 1) {
new <- m1[1]
}
if (old == new) next
}
# Write to file
temp <- c(snps[i], old, new)
write(temp, file=fid, ncolumns=3, sep="\t")
}
close(fid)
# Rename the alleles
temp <- glu.renameAlleles(snp2.list, tmp, outfile=outfile)
# Delete temporary file
if (temp.list$delete) file.remove(tmp)
0
} # END: glu.mergeAlleles
# Function for calling ginfo
glu.ginfo <- function(snp.list, outloci="out.txt") {
snp.list <- check.snp.list(snp.list)
type <- snp.list$file.type
if (type == 2) type <- "ldat"
if (type == 3) type <- "sdat"
snp <- paste(snp.list$dir, snp.list$file, sep="")
command <- paste("glu ginfo --outputloci=", outloci, sep="")
command <- paste(command, " -f ", type, " ", snp, ":unique=n", sep="")
ret <- callOS(command)
ret
} # END: glu.ginfo
# Function to get the common snps
glu.commonSNPs <- function(slist, temp.list=NULL, outfile=NULL) {
# slist List of sublists of type snp.list
temp.list <- check.temp.list(temp.list)
tmpfile <- getTempfile(temp.list$dir, prefix="snp", ext=".txt")
tlist <- list(file=tmpfile, file.type=3, delimiter="\t", header=1)
n <- length(slist)
for (i in 1:n) {
temp <- glu.ginfo(slist[[i]], outloci=tmpfile)
temp <- getColumns(tlist, "LOCUS", temp.list=temp.list)
if (i == 1) {
snps <- temp[[1]]
} else {
snps <- intersect(snps, temp[[1]])
}
}
snps <- unique(snps)
if (!is.null(outfile)) write(snps, file=outfile, ncolumns=1)
snps
} # END: glu.commonSNPs
# Function to call qc.summary
glu.qcsummary <- function(snp.list, out.loci="out.txt", include.loci=NULL,
include.samples=NULL) {
snp.list <- check.snp.list(snp.list)
snp <- paste(snp.list$dir, snp.list$file, sep="")
command <- paste("glu qc.summary --locusout=", out.loci, sep="")
command <- paste(command, " ", snp, sep="")
if (!is.null(include.loci)) {
command <- paste(command, " --includeloci=", include.loci, sep="")
}
if (!is.null(include.samples)) {
command <- paste(command, " --includesamples=", include.samples, sep="")
}
ret <- callOS(command)
ret
} # END: glu.qcsummary
# Function to rename alleles
glu.renameAlleles <- function(snp.list, alleleFile, outfile="out.txt") {
snp.list <- check.snp.list(snp.list)
type <- snp.list$file.type
if (type == 2) type <- "ldat"
if (type == 3) type <- "sdat"
snp <- paste(snp.list$dir, snp.list$file, sep="")
command <- paste("glu transform -o ", outfile, sep="")
command <- paste(command, " -f ", type, sep="")
command <- paste(command, " --renamealleles=", alleleFile, sep="")
command <- paste(command, " ", snp, sep="")
ret <- callOS(command)
ret
} # END: glu.renameAlleles
# Function to compute the MAF in controls using GLU
getMAF.control <- function(snp.list, pheno.list, data, op=NULL, temp.list=NULL) {
# Check the lists
snp.list <- check.snp.list(snp.list)
pheno.list <- check.pheno.list(pheno.list)
temp.list <- check.temp.list(temp.list)
# Get the subset of controls
x <- read.table(pheno.list$file, header=pheno.list$header, sep=pheno.list$delimiter)
temp <- getListName(pheno.list, "subsetData")
if (!is.null(temp)) {
x <- subsetData.list(x, temp)
}
vars <- c(pheno.list$id.var, pheno.list$response.var)
x <- removeOrKeepCols(x, vars, which=1)
x <- unfactor.all(x)
# Get the controls
temp <- x[, pheno.list$response.var] == 0
x <- removeOrKeepRows(x, temp, which=1)
# Create a temporary file for the subject ids
tmp <- getTempfile(temp.list$dir, prefix="samples", ext=".txt")
# Write the samples ids
write(x[, pheno.list$id.var], file=tmp, ncolumns=1)
# Create a temporary file for the snp ids
tmp2 <- getTempfile(temp.list$dir, prefix="snps", ext=".txt")
# Write the samples ids
write(data[, "SNP"], file=tmp2, ncolumns=1)
rm(x, temp, vars)
temp <- gc(verbose=FALSE)
# Create a temporary file for GLU output
gluout <- getTempfile(temp.list$dir, prefix="glu", ext=".txt")
# Call GLU
temp <- glu.qcsummary(snp.list, out.loci=gluout, include.loci=tmp2,
include.samples=tmp)
# Add the MAF column
temp <- list(file=gluout, file.type=3, delimiter="\t", header=1, id.var="LOCUS", vars="MAF", names="MAF")
data <- addColumn(data, "SNP", temp)
# Delete files
if (temp.list$delete) {
file.remove(tmp)
file.remove(gluout)
file.remove(tmp2)
}
data
} # END: getMAF.control
# Function to get info on which snps belong to which study
glu.partition <- function(snp.list, pheno.list, temp.list=NULL, op=NULL) {
# Check the lists
op <- default.list(op, c("byCC"), list(0), error=c(0))
snp.list <- check.snp.list(snp.list)
pheno.list <- check.pheno.list(pheno.list)
temp.list <- check.temp.list(temp.list)
pheno.list <- default.list(pheno.list, c("partition.var", "is.the.data"),
list("ERROR", 0), error=c(1, 0))
idVal <- temp.list$id
if (temp.list$dir %in% c("", " ")) temp.list$dir <- "./"
# Check if byCC option was specified
byCC <- op$byCC
if (byCC) {
temp <- pheno.list[["cc.var", exact=TRUE]]
if (is.null(temp)) {
stop("ERROR: cc.var not specified in pheno.list")
}
} else {
ccLevels <- 1
}
pvar <- pheno.list$partition.var
id <- pheno.list$id.var
x <- readTable(pheno.list$file, pheno.list)
pv <- makeVector(unfactor(x[, pvar]))
id <- makeVector(unfactor(x[, id]))
if (byCC) {
cv <- makeVector(unfactor(x[, pheno.list$cc.var]))
ccLevels <- unique(cv)
}
nr <- nrow(x)
rm(x)
gc()
# Get the partition levels
levels <- unique(pv)
# Create a temporary file for the subject ids
tmpfile <- getTempfile(temp.list$dir, prefix=paste("samples_", idVal, sep=""), ext=".txt")
# Create a temporary file for the qc output
tmpqc <- getTempfile(temp.list$dir, prefix=paste("qcout_", idVal, sep=""), ext=".txt")
# GLU variables
vars <- c("NUM_GENOTYPES", "GENOTYPE_COUNTS")
# Loop over the levels
x <- NULL
for (lev in levels) {
print(lev)
temp0 <- (pv == lev)
temp0[is.na(temp0)] <- FALSE
for (cc in ccLevels) {
if (byCC) {
temp <- temp0 & (cv == cc)
temp[is.na(temp)] <- FALSE
} else {
temp <- temp0
}
s <- sum(temp)
print(s)
if (!s) next
# Write out the ids
write(id[temp], file=tmpfile, ncolumns=1)
# Call qc.summary
temp <- glu.qcsummary(snp.list, out.loci=tmpqc, include.loci=NULL,
include.samples=tmpfile)
# New variable names
vname <- paste(pvar, ".", lev, sep="")
new <- c(vname, paste(vname, ".COUNTS", sep=""))
if (byCC) new <- paste(new, ".", cc, sep="")
# Add the info
if (is.null(x)) {
tlist <- list(what="character", returnMatrix=1, include.row1=1, delimiter="\t")
x <- scanFile(tmpqc, tlist)
x <- x[, c("LOCUS", "NUM_GENOTYPES", "GENOTYPE_COUNTS")]
x <- data.frame(x)
x <- unfactor.all(x)
x <- renameVar(x, "LOCUS", "SNP")
x <- renameVar(x, vars[1], new[1])
x <- renameVar(x, vars[2], new[2])
x[, new[1]] <- as.numeric(x[, new[1]])
partVars <- new[1]
} else {
tlist <- list(file=tmpqc, file.type=3, header=1, delimiter="\t",
id.var="LOCUS", vars=vars, names=new, type=c("N", "C"))
x <- addColumn(x, "SNP", tlist)
partVars <- c(partVars, new[1])
}
# Delete files
file.remove(tmpfile)
file.remove(tmpqc)
} # END: for (cc in ccLevels)
} # END: for (lev in levels
# Determine which snps belonged to all studies
v <- "ALL_PARTS"
x[, v] <- 1
for (var in partVars) x[, v] <- as.numeric(x[, v] & (x[, var] > 0))
out <- op[["outfile", exact=TRUE]]
if (!is.null(out)) writeTable(x, out)
# Return the SNPs
temp <- x[, v] == 1
temp[is.na(temp)] <- FALSE
snps <- makeVector(x[temp, "SNP"])
snps
} # END: glu.partition
# Function to get (study specific geno counts)
glu.getCounts <- function(snp.list, pheno.list, temp.list=NULL, op=NULL) {
# op List with names
# byCC 0 or 1
# The default is 1
# by.var Name of variable on phenotype data for seperate counts
# The default is NULL
# snps List of type file.list or a character vector of snps
# The default is NULL
# Check lists
snp.list <- check.snp.list(snp.list)
pheno.list <- check.pheno.list(pheno.list)
temp.list <- check.temp.list(temp.list)
op <- default.list(op, c("byCC"), list(1))
# Get the by variable
byVar <- getListName(op, "by.var")
if (is.null(byVar)) {
byFlag <- 0
} else {
byFlag <- 1
}
# Get the case-control variable
ccVar <- getListName(pheno.list, "cc.var")
if (is.null(ccVar)) op$byCC <- 0
ccFlag <- op$byCC
# Get the snps
snpFlag <- 1
snps <- getListName(op, "snps")
if (is.null(snps)) snpFlag <- 0
if (snpFlag) {
if (typeof(snps) != "character") {
snps <- scan(snps$file, what="character", sep="\n")
}
# Write these snps out to a temporary file
tmpfile <- getTempfile(temp.list$dir, prefix="snps", ext=".txt")
write(snps, file=tmpfile, ncolumns=1)
rm(snps)
gc()
}
# Create temporary file for GLU output
tmpglu <- getTempfile(temp.list$dir, prefix="gluOut", ext=".txt")
# Create string
str <- paste("glu transform -F ldat -o ", tmpglu, sep="")
if (snpFlag) str <- paste(str, " --includeloci=", tmpfile, sep="")
str <- paste(str, " ", snp.list$file, sep="")
print(str)
temp <- callOS(str)
# Update snp.list
snp.list$file <- tmpglu
snp.list$file.type <- 2
snp.list$stream <- 0
pheno.list$keep.vars <- c(pheno.list$id.var, ccVar, byVar)
# Get the data
tlist <- list(include.row1=0, include.snps=0, return.type=1, MAF=0,
missing=1, snpNames=1, orderByPheno=1, return.pheno=1)
temp <- try(getData.1(snp.list, pheno.list, temp.list, op=tlist),
silent=TRUE)
if (class(temp) == "try-error") {
print(temp)
stop("ERROR loading data")
}
snpData <- temp$data
#missing <- temp$missing
snpNames <- temp$snpNames
delimiter <- "\t"
nsnps <- length(snpData)
# Get the phenotype data
phenoData.list <- temp$phenoData.list
phenoData0 <- phenoData.list$data
nr <- nrow(phenoData0)
if (ccFlag) cc <- makeVector(phenoData0[, ccVar])
if (byFlag) by <- makeVector(phenoData0[, byVar])
rm(temp, phenoData.list, phenoData0)
gc()
# Remove temporary files
file.remove(tmpfile)
file.remove(tmpglu)
# Get the unique levels of ccVar and byVar
if (ccFlag) {
ccLevels <- unique(cc)
ncc <- length(ccLevels)
} else {
ncc <- 1
}
if (byFlag) {
byLevels <- unique(by)
nby <- length(byLevels)
} else {
nby <- 1
}
# Initialize return data frame
ret <- data.frame(snpNames)
colnames(ret) <- "SNP"
rm(snpNames)
gc()
# Loop over each level
for (i in 1:nby) {
if (byFlag) {
print(byLevels[i])
tempBy <- by == byLevels[i]
tempBy[is.na(tempBy)] <- FALSE
} else {
tempBy <- rep(TRUE, times=nr)
}
for (j in 1:ncc) {
if (byFlag) {
print(ccLevels[j])
tempCC <- cc == ccLevels[j]
tempCC[is.na(tempCC)] <- FALSE
} else {
tempCC <- rep(TRUE, times=nr)
}
# Get the correct subset
rows <- tempBy & tempCC
# Get the new variable name
if (byFlag) new <- byLevels[i]
if (ccFlag) {
new <- paste(new, ".", ccLevels[j], sep="")
}
ret[, new] <- NA
# Loop over each SNP
for (k in 1:nsnps) {
snp <- as.integer(getVecFromStr(snpData[k], delimiter=delimiter))
counts <- getGenoCounts(snp[rows], exclude=c(NA, NaN), check=1)
ret[k, new] <- paste(counts, collapse="|", sep="")
}
}
} # END: for (i in 1:nby)
# Output file
out <- getListName(op, "outfile")
if (!is.null(out)) writeTable(ret, out)
ret
} # END: glu.getCounts
# Function to get r2 and dprime.
# Subjects to use is determined by pheno.list$subsetData. Set pheno.list
# to use all subjects in the genotype data.
# Use snp.list$snpNames to specify the snps
glu.r2 <- function(snp.list, pheno.list=NULL, op=NULL) {
# op List with names
# outfile
# Check lists
snp.list <- check.snp.list(snp.list)
if (!is.null(pheno.list)) {
pheno.list <- check.pheno.list(pheno.list)
pflag <- 1
} else {
pflag <- 0
}
op <- default.list(op, c("outfile"), list("ERROR"), error=1)
temp.list <- getListName(op, "temp.list")
temp.list <- check.temp.list(temp.list)
# Write out subject ids
if (pflag) {
temp <- getPhenoData(pheno.list, temp.list=temp.list)$data
tfile <- getTempfile(temp.list$dir, prefix="subs_r2_", ext=".txt")
temp <- makeVector(temp[, pheno.list$id.var])
write(temp, file=tfile, ncolumns=1)
}
# Write out snps
snps <- getListName(snp.list, "snpNames")
sflag <- !is.null(snps)
if (sflag) {
snpfile <- getTempfile(temp.list$dir, prefix="SNP_r2_", ext=".txt")
write(snp.list$snpNames, file=snpfile, ncolumns=1)
snpStr <- paste(" --includeloci=", snpfile, sep="")
} else {
snpStr <- paste(" --includeloci=", snp.list$snpNames.list$file, sep="")
}
# Call GLU
str <- "glu tagzilla --skipbinning -r 0"
str <- paste(str, " --saveldpairs=", op$outfile, sep="")
str <- paste(str, snpStr, sep="")
if (pflag) str <- paste(str, " --includesamples=", tfile, sep="")
str <- paste(str, " ", snp.list$file, sep="")
callOS(str)
# Delete file
if (temp.list$delete) {
if (sflag) file.remove(snpfile)
if (pflag) file.remove(tfile)
}
# Read in and edit the glu output
x <- read.table(op$outfile, header=1, sep="\t", as.is=FALSE)
x <- unfactor.all(x)
temp <- x[, "LNAME1"] == x[, "LNAME2"]
temp[is.na(temp)] <- FALSE
x <- removeOrKeepRows(x, temp, which=-1)
writeTable(x, op$outfile)
x
} # END: glu.r2
# Function to merge phenotype and genotype data
glu.mergePhenoGeno <- function(snp.list, pheno.list, op=NULL) {
# op List of options for mergePhenoGeno and temp.list
# outfile
# which 0-2 for the coding
snp.list <- check.snp.list(snp.list)
pheno.list <- check.pheno.list(pheno.list)
temp.list <- getListName(op, "temp.list")
temp.list <- check.temp.list(temp.list)
snp.list <- default.list(snp.list, "snpNames", list("ERROR"), error=1)
# Temp file for ldat and snpNames
tmpfile <- getTempfile(temp.list$dir, prefix="SNP", ext=".ldat")
tmp <- getTempfile(temp.list$dir, prefix="SNPNAMES", ext=".txt")
temp <- getListName(snp.list, "snpNames")
write(temp, file=tmp, ncolumns=1)
str <- "glu transform -F ldat -o "
str <- paste(str, tmpfile, sep="")
str <- paste(str, " --includeloci=", tmp, sep="")
str <- paste(str, " --orderloci=", tmp, sep="")
str <- paste(str, " ", snp.list$file, sep="")
callOS(str)
snp.list$file <- tmpfile
snp.list$file.type <- 2
snp.list$delimiter <- "\t"
x <- mergePhenoGeno(snp.list, pheno.list, temp.list=temp.list, op=op)
if (temp.list$delete) {
file.remove(tmpfile)
file.remove(tmp)
}
x
} # END: glu.mergePhenoGeno
# Function to add missing SNPs in ld matrix
addMissingSNPs <- function(snps, mat) {
n <- length(snps)
nr <- nrow(mat)
if (n == nr) return(mat)
cnames <- colnames(mat)
temp <- !(snps %in% cnames)
miss <- snps[temp]
nmiss <- length(miss)
mat <- as.matrix(mat)
colnames(mat) <- NULL
rownames(mat) <- NULL
mat <- rbind(mat, matrix(0, nrow=nmiss, ncol=nr))
mat <- cbind(mat, matrix(0, nrow=nrow(mat), ncol=nmiss))
cnames <- c(cnames, miss)
rownames(mat) <- cnames
colnames(mat) <- cnames
diag(mat) <- 1
mat
}
# Function for returning an ld matrix
glu.ldMatrix <- function(snps, genoFile, op=NULL) {
temp.list <- op[["temp.list", exact=TRUE]]
temp.list <- check.temp.list(temp.list)
op <- default.list(op, c("maxdist", "minmaf", "measure", "order", "addMiss"),
list(200, 0.05, "r2", 1, 1))
# Determine where it is running
#host <- callOS("hostname", inter=TRUE)
#if (host == "biowulf.nih.gov) {
# bioFlag <- 1
#} else {
# bioFlag <- 0
#}
snps <- unique(snps)
nsnps <- length(snps)
if (nsnps == 1) {
x <- matrix(data=1.0, nrow=1, ncol=1)
rownames(x) <- snps
colnames(x) <- snps
return(x)
}
# Write snps to a temporary file
tempfile <- getTempfile(temp.list$dir, prefix=paste("_tmplist_", temp.list$id, "_", sep=""), ext=".txt")
write(snps, file=tempfile, ncolumns=1)
# Temporary output file
tempout <- getTempfile(temp.list$dir, prefix=paste("_tmpout_", temp.list$id, "_", sep=""), ext=".txt")
# Create the glu call command
str <- paste("glu ld.matrix --maxdist=", op$maxdist, " --minmaf=", op$minmaf,
" --measure=", op$measure, " --includeloci=", tempfile,
" --output=", tempout, sep="")
# Loci description file
#temp <- op[["loci.file", exact=TRUE]]
#if ((is.null(temp)) && ())
str <- paste(str, " ", genoFile, sep="")
callOS(str)
# Read in the matrix
x <- read.table(tempout, as.is=TRUE, header=1, row.names=1, sep="\t")
n <- ncol(x)
for (i in 1:(n-1)) {
cols <- (i+1):n
x[i, cols] <- x[cols, i]
}
temp <- is.na(x)
x[temp] <- 0
# Delete temp files
if (temp.list$delete) {
file.remove(tempfile)
file.remove(tempout)
}
# Add missing snps if needed
if (op$addMiss) x <- addMissingSNPs(snps, x)
# Check for error
if (nsnps != nrow(x)) stop("ERROR in glu.ldMatrix: incorrect dimension of LD matrix")
# Order the same as the input list of snps
if (op$order) x <- x[snps, snps]
x
} # glu.ldMatrix
# Function for getting the snps in a list of genes
glu.snps_in_genes <- function(gene.list, op=NULL) {
keep <- c("LOCUS", "CHROMOSOME", "LOCATION", "FEATURE_NAME", "FEATURE_TYPE")
op <- default.list(op, c("d", "u", "keep.vars"), list(70000, 70000, keep))
temp.list <- op[["temp.list", exact=TRUE]]
temp.list <- check.temp.list(temp.list)
dir <- temp.list$dir
id <- temp.list$id
genoFile <- op[["geno.file", exact=TRUE]]
genoFlag <- !is.null(genoFile)
tmpfile <- getTempfile(dir, paste("genedb_", id, sep=""), ext=".txt")
tmpgene <- getTempfile(dir, paste("genes_", id, sep=""), ext=".txt")
write(gene.list, file=tmpgene, ncolumns=1)
str <- paste("glu genedb.find_snps -u ", op$u, " -d ", op$d,
" -o ", tmpfile, " ", tmpgene, sep="")
print(str)
callOS(str)
x <- loadData.table(tmpfile)
keep <- op[["keep.vars", exact=TRUE]]
if (!is.null(keep)) x <- x[, keep]
if (genoFlag) {
str <- paste("glu ginfo --outputloci=", tmpfile, ":c=1 ", genoFile, sep="")
print(str)
callOS(str)
snps <- scan(tmpfile, what="character", sep="\n")
temp <- x[, "LOCUS"] %in% snps
x <- x[temp, ]
rm(snps)
gc()
}
if (temp.list$delete) {
file.remove(tmpfile)
file.remove(tmpgene)
}
temp <- op[["out.file", exact=TRUE]]
if (!is.null(temp)) writeTable(x, temp)
x
} # END: glu.snps_in_genes
# Function for returning the number of bins
glu.nBins <- function(snps, genoFile, op=NULL) {
op <- default.list(op, c("r2.threshold", "min.MAF", "temp.list", "return.data", "snpsIsFile", "samplesIsFile"),
list(0.8, 0.05, list(), 0, 0, 0))
temp.list <- check.temp.list(op$temp.list)
snpFlag <- op$snpsIsFile
if (!snpFlag) {
snps <- unique(snps)
nsnps <- length(snps)
if (nsnps == 1) return(1)
# Write snps to a temporary file
tempfile <- getTempfile(temp.list$dir, prefix=paste("_tmplist_", temp.list$id, "_", sep=""), ext=".txt")
write(snps, file=tempfile, ncolumns=1)
} else {
tempfile <- snps
}
samples <- op[["samples", exact=TRUE]]
sflag <- !is.null(samples)
sampFlag <- op$samplesIsFile
if (sflag) {
if (!sampFlag) {
# Write samples to a temporary file
tempsamp <- getTempfile(temp.list$dir, prefix=paste("_tmpsamp_", temp.list$id, "_", sep=""), ext=".txt")
write(samples, file=tempsamp, ncolumns=1)
} else {
tempsamp <- samples
}
}
# Temporary output file
tempout <- getTempfile(temp.list$dir, prefix=paste("_tmpout_", temp.list$id, "_", sep=""), ext=".txt")
# Create the glu call command
str <- paste("glu ld.tagzilla -r ", op$r2.threshold, " -a ", op$min.MAF,
" --includeloci=", tempfile,
" -O ", tempout, sep="")
if (sflag) str <- paste(str, " --includesamples=", tempsamp, sep="")
str <- paste(str, " ", genoFile, sep="")
callOS(str)
# Read in the matrix
x <- read.table(tempout, as.is=TRUE, header=1, sep="\t")
n <- max(as.numeric(x[, "BINNUM"]), na.rm=TRUE)
# Delete temp files
if (temp.list$delete) {
if (!snpFlag) file.remove(tempfile)
file.remove(tempout)
if ((sflag) && (!sampFlag)) file.remove(tempsamp)
}
if (op$return.data) return(x)
n
} # END: glu.nBins
# Function for returning the snps in a set of bins
glu.SNP_bins <- function(snps, genoFile, op=NULL) {
# op
# samples Character vector of sample ids
op <- default.list(op, c("return.data"), list(1))
x <- glu.nBins(snps, genoFile, op=op)
# LNAME LOCATION MAF BINNUM DISPOSITION
# For each bin, choose the snp with the largest MAF
bin <- as.numeric(makeVector(x[, "BINNUM"]))
snp <- makeVector(x[, "LNAME"])
maf <- as.numeric(makeVector(x[, "MAF"]))
rm(x)
gc()
ubins <- unique(bin)
nbins <- length(ubins)
ret <- character(nbins)
for (i in 1:nbins) {
temp <- bin == ubins[i]
if (sum(temp) == 1) {
ret[i] <- snp[temp]
} else {
j <- which.max(maf[temp])
ret[i] <- (snp[temp])[j]
}
}
ret
} # END: glu.SNP_bins
# Function to convert to a ped file
glu.create_ped <- function(snp.list, pheno.list, op=NULL, info.list=NULL) {
# snp.list File must be a format GLU can read
# pheno.list With names response.var, gender.var, male, female
# op
# map.type 0=file from GLU, 1=format for haploview
# GLU path to GLU
# info.list List with snp info (location) for map file
# id.var SNP column
# loc.var
snp.list <- check.snp.list(snp.list)
pheno.list <- check.pheno.list(pheno.list)
pheno.list <- default.list(pheno.list,
c("response.var", "gender.var", "male", "female"),
list("ERROR", "ERROR", "MALE", "FEMALE"),
error=c(1, 1, 0, 0))
op <- default.list(op, c("temp.list", "map.type", "GLU"), list(list(), 0, "glu"))
temp.list <- check.temp.list(op$temp.list)
GLU <- op$GLU
x <- loadData.table(pheno.list)
print(dim(x))
vars <- c(pheno.list$id.var, pheno.list$response.var, pheno.list$gender.var)
x <- removeOrKeepCols(x, vars)
subids <- makeVector(x[, pheno.list$id.var])
gender0 <- makeVector(x[, pheno.list$gender.var])
response0 <- makeVector(x[, pheno.list$response.var])
gender <- integer(length(gender0))
temp <- gender0 %in% pheno.list$male
gender[temp] <- 1
temp <- gender0 %in% pheno.list$female
gender[temp] <- 2
response <- integer(length(response0))
temp <- response0 == 0
temp[is.na(temp)] <- FALSE
response[temp] <- 1
temp <- response0 == 1
temp[is.na(temp)] <- FALSE
response[temp] <- 2
rm(x, gender0, response0)
gc()
# See if snps are in a file already
snp.file <- op[["snp.file", exact=TRUE]]
snpFlag <- !is.null(snp.file)
# Create file for sample ids
dir <- temp.list$dir
id <- temp.list$id
tmpids <- paste(dir, "ids_", id, ".txt", sep="")
write(subids, file=tmpids, ncolumns=1)
# Temporary output file
tmpfile <- paste(dir, "out_", id, ".ped", sep="")
str <- paste(GLU, " transform -F ped -o ", tmpfile, sep="")
if (snpFlag) str <- paste(str, " --includeloci=", snp.file, sep="")
str <- paste(str, " --includesamples=", tmpids, sep="")
str <- paste(str, " ", snp.list$file, sep="")
print(str)
callOS(str)
# File has columns: Family id, subject id, father id, mother id, gender, phenotype, genotypes
ped <- scan(tmpfile, what="character", sep="\n")
n <- length(ped)
for (i in 1:n) {
x <- getVecFromStr(ped[i], delimiter=" ")
row <- match(x[2], subids)
x[5:6] <- c(gender[row], response[row])
ped[i] <- paste(x, collapse=" ", sep="")
}
out <- op[["out.ped", exact=TRUE]]
if (!is.null(out)) write(ped, file=out, ncolumns=1)
rm(ped)
gc()
out <- op[["out.map", exact=TRUE]]
if (!is.null(out)) {
# Get the map file name
len <- nchar(tmpfile)
tmpmap <- substr(tmpfile, 1, len-4)
tmpmap <- paste(tmpmap, ".map", sep="")
file.copy(tmpmap, out, overwrite=TRUE)
}
# Read the map file
x <- matrix(scan(out, what="character", sep=" "), byrow=TRUE, ncol=4)
# Add locations
if (!is.null(info.list)) {
info.list <- default.list(info.list, c("id.var", "loc.var"), list("ERROR", "ERROR"), error=c(1, 1))
info.list <- check.file.list(info.list)
id.var <- info.list$id.var
y <- loadData.table(info.list)
temp <- x[, 2] %in% y[, id.var]
ids <- match(x[, 2], y[, id.var])
ids <- ids[!is.na(ids)]
x[temp, 4] <- y[ids, info.list$loc.var]
}
if (op$map.type) {
x <- x[, c(2, 4)]
}
write.table(x, file=out, sep=" ", row.names=FALSE, quote=FALSE, col.names=FALSE)
if (temp.list$delete) {
file.remove(tmpids)
file.remove(tmpfile)
file.remove(tmpmap)
}
}
glu.LD.snps <- function(snps.file, geno.file, out.dir, qc.file, subs.file=NULL, GLU="GLU",
id.str="", delta=200000, delete=1, snp.var="LOCUS", chr.var="CHROMOSOME", loc.var="LOCATION") {
tmp.snps <- paste(out.dir, "tmp.snps", id.str, sep="")
tmp.glu <- paste(out.dir, "tmp.glu", id.str, sep="")
out.file <- paste(out.dir, "GLU.LD.snps", id.str, ".txt.xls", sep="")
out.max <- paste(out.dir, "GLU.R2.max", id.str, ".txt.xls", sep="")
# Get the complete set of snps
x <- loadData.table(qc.file)
keep <- c(snp.var, loc.var, chr.var)
x <- x[, keep]
all <- NULL
snps <- scan(snps.file, what="character")
temp <- x[, snp.var] %in% snps
y <- x[temp, , drop=FALSE]
LOC <- as.numeric(x[, loc.var])
CHR <- x[, chr.var]
for (i in 1:nrow(y)) {
snp <- y[i, snp.var]
chr <- y[i, chr.var]
loc <- as.numeric(y[i, loc.var])
temp <- (CHR %in% chr) & (LOC >= loc - delta) & (LOC <= loc + delta)
temp[is.na(temp)] <- FALSE
n <- sum(temp)
print(c(snp, chr, loc, n))
if (!n) next
all <- unique(c(all, x[temp, snp.var]))
}
n <- length(all)
if (!n) stop("ERROR: no SNPs")
write(all, file=tmp.snps, ncolumns=1)
# Call GLU
str <- paste(GLU, " tagzilla --skipbinning -r 0 -m 9999 -a 0.04", sep="")
str <- paste(str, " --saveldpairs=", tmp.glu, sep="")
str <- paste(str, " --includeloci=", tmp.snps, sep="")
if (!is.null(subs.file)) str <- paste(str, " --includesamples=", subs.file, sep="")
str <- paste(str, " ", geno.file, sep="")
print(str)
callOS(str)
# Read in and edit the glu output
x <- read.table(tmp.glu, header=1, sep="\t", as.is=TRUE)
temp <- ((x[, "LNAME1"] %in% snps) | (x[, "LNAME2"] %in% snps)) & (x[, "LNAME1"] != x[, "LNAME2"])
temp[is.na(temp)] <- FALSE
x <- x[temp, , drop=FALSE]
# Put snps in column1
temp <- (x[, "LNAME2"] %in% snps)
if (any(temp)) {
tsnp <- x[temp, "LNAME1"]
x[temp, "LNAME1"] <- x[temp, "LNAME2"]
x[temp, "LNAME2"] <- tsnp
}
x <- sort2D(x, "RSQUARED", fun=as.numeric, dec=TRUE)
x <- sort2D(x, "LNAME1")
writeTable(x, out.file)
# Maximum R^2 for each snp, x is sorted
dup <- duplicated(x[, "LNAME1"])
y <- x[!dup, , drop=FALSE]
writeTable(y, out.max)
# Delete file
if (delete) {
file.remove(tmp.snps)
file.remove(tmp.glu)
}
x
} # END: glu.LD.snps
# Function to extract snps
glu.extract <- function(chr.vec, start.vec, end.vec, geno.file, out.dir, qc.file, subs.file=NULL, GLU="GLU", out.format="sdat", gzip=1,
id.str="", delta=20000, delete=1, snp.var="LOCUS", chr.var="CHROMOSOME", loc.var="LOCATION") {
tmp.snps <- paste(out.dir, "tmp.snps", id.str, sep="")
out.file <- paste(out.dir, "snps", id.str, ".", out.format, sep="")
# Get the complete set of snps
x <- loadData.table(qc.file)
keep <- c(snp.var, loc.var, chr.var)
x <- x[, keep]
temp <- x[, chr.var] %in% chr.vec
x <- x[temp, ]
all <- NULL
LOC <- as.numeric(x[, loc.var])
CHR <- x[, chr.var]
print(x[1:10, ])
#print(sort(LOC))
for (i in 1:length(chr.vec)) {
chr <- chr.vec[i]
a <- start.vec[i]
b <- end.vec[i]
temp <- (CHR %in% chr) & (LOC >= a - delta) & (LOC <= b + delta)
temp[is.na(temp)] <- FALSE
n <- sum(temp)
print(c(chr, a, b, n))
if (!n) next
all <- unique(c(all, x[temp, snp.var]))
}
n <- length(all)
if (!n) stop("ERROR: no SNPs")
write(all, file=tmp.snps, ncolumns=1)
# Call GLU
str <- paste(GLU, " transform -F ", out.format, sep="")
str <- paste(str, " --includeloci=", tmp.snps, sep="")
if (!is.null(subs.file)) str <- paste(str, " --includesamples=", subs.file, sep="")
str <- paste(str, " -o ", out.file, " ", geno.file, sep="")
print(str)
callOS(str)
if (gzip) {
str <- paste("gzip ", out.file, sep="")
print(str)
callOS(str)
}
# Delete file
if (delete) {
file.remove(tmp.snps)
}
NULL
} # END: glu.extract
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.