R/wga_glu.R

Defines functions glu.extract glu.LD.snps glu.create_ped glu.SNP_bins glu.nBins glu.snps_in_genes glu.ldMatrix addMissingSNPs glu.mergePhenoGeno glu.r2 glu.getCounts glu.partition getMAF.control glu.renameAlleles glu.qcsummary glu.commonSNPs glu.ginfo glu.mergeAlleles glu.transform

# History Jul 02 2008 Initial coding
#         Nov 12 2008 Add glu.mergeAlleles
#         Dec 12 2008 Add glu.qcsummary
#         Dec 16 2008 Add options list to glu.mergeAlleles
#                     Add function glu.renameAlleles
#         Feb 02 2009 Update glu.qcsummary for samples
#                     Add function getMAF.control
#         Feb 17 2009 Update getMAF.control for subsetting the phenotype data
#         May 21 2009 Add glu.partition function
#         Jun 10 2009 Update glu.partition for cases and controls
#         Jul 07 2009 Add function glu.getCounts
#         Jul 20 2009 Fix bug in glu.partition with obtaining ccLevels
#         Jul 22 2009 Add error checks in glu.partition
#         Aug 10 2009 Add glu.r2 function
#         Oct 19 2009 Add glu.mergePhenoGeno function
#         Nov 02 2009 NO CHANGE
#         Dec 22 2009 Add glu.setUp function for 
#         Dec 28 2009 Put glu.setUp in wga_util2.R
#                     Add options for glu.partition
#         Jan 11 2010 Update temporary files in glu.partition 
#         Jan 14 2010 In glu.partition, allow data to be passed in
#         Sep 13 2010 Add function for obtaining r2 matrix
#         Oct 18 2010 Add function for getting the snps in genes
#         Oct 19 2010 Add function for retuning the number of bins using GLU
#         Oct 22 2010 Add function for returning the snps in bins based on
#                     an R^2 threshold and MAF.
#         Oct 25 2010 Change option names in glu.nBins
#         Nov 12 2010 Add function to convert to a ped file for Haploview
#         Nov 16 2010 Change options in ldmatrix and add function to add missing snps
#         Jan 21 2011 Convert mat to a matrix in addMissingSNPs
#         Sep 12 2013 Add function to find snps in LD with a set of snps
#         Sep 13 2013 Add function to extract snps
#         Sep 19 2013 Add GLU option and info.list to create_ped function

# Function to open a connection and transfrom data
glu.transform <- function(infile, inFormat=NULL, outFormat="ldat") {

  # infile      Path to file to open a connection
  #             No default
  # inFormat    GLU format of infile  
  #             The default is NULL
  # outFormat   The GLU format of the transformed data
  #             The default is "ldat".

  command <- "glu transform"
  if (!is.null(inFormat)) command <- paste(command, " -f ", inFormat, sep="")
  command <- paste(command, " ", infile, sep="")
  if (!is.null(outFormat)) command <- paste(command, " -F ", outFormat, sep="")
  
  fid <- pipe(command, open="r")

} # END: glu.transform

# Function for changing the alleles using GLU
glu.mergeAlleles <- function(snp1.list, snp2.list, temp.list=NULL,
                     op=NULL) {

  # snp1.list and snp2.list can also have the field "qcsummary".

  # snp1.list     Base genotype data list
  # snp2.list     
  # temp.list
  ###########################################################
  # op            List with names:
  # outfile       Updated genotype file for snp2.list
  # include.loci
  # flip2         0 or 1 for flipping alleles in snp2.list
  ###########################################################

  # Check the options list
  op <- default.list(op,
         c("outfile", "flip2"), list("out.txt", 0))

  outfile      <- op$outfile
  flip2        <- op$flip2
  include.loci <- getListName(op, "include.loci")
  rm(op)
  temp <- gc(verbose=FALSE)

  temp.list <- check.temp.list(temp.list)
  snp1.list <- check.snp.list(snp1.list)
  snp2.list <- check.snp.list(snp2.list)

  # Check if the qc.summary files exist
  tmp1 <- getListName(snp1.list, "qcsummary")
  if (is.null(tmp1)) {
    tmp1 <- getTempfile(temp.list$dir, prefix="snp1", ext=".txt")  
    temp <- glu.qcsummary(snp1.list, out.loci=tmp1, include.loci=include.loci)
  }
  tmp2 <- getListName(snp2.list, "qcsummary")
  if (is.null(tmp2)) {
    tmp2 <- getTempfile(temp.list$dir, prefix="snp2", ext=".txt") 
    temp <- glu.qcsummary(snp2.list, out.loci=tmp2, include.loci=include.loci)
  }

  # Get the snp names and alleles
  vars  <- c("LOCUS", "NUM_ALLELES", "ALLELES")
  tlist <- list(file=tmp1, file.type=3, delimiter="\t", header=1) 
  s1    <- getColumns(tlist, vars, temp.list=temp.list)
  tlist$file <- tmp2
  s2    <- getColumns(tlist, vars, temp.list=temp.list)
  names(s1$ALLELES)     <- s1$LOCUS
  names(s1$NUM_ALLELES) <- s1$LOCUS
  names(s2$ALLELES)     <- s2$LOCUS
  names(s2$NUM_ALLELES) <- s2$LOCUS

  snps <- intersect(s1$LOCUS, s2$LOCUS)
  
  # Remove snps = "*" (last row of qc.summary file)
  snps <- snps[snps != "*"]
  if (!length(snps)) {
    print("No intersecting SNPs")
    return(0)
  }
  a1   <- s1$ALLELES[snps]
  a2   <- s2$ALLELES[snps]
  n1   <- as.numeric(s1$NUM_ALLELES[snps])
  n2   <- as.numeric(s2$NUM_ALLELES[snps])

  # Delete temporary files
  if (temp.list$delete) {
    file.remove(tmp1)
    file.remove(tmp2)
  }
  
  rm(s1, s2, tmp1, tmp2)
  temp <- gc()

  # Check for more than 2 alleles
  temp <- (n1 > 2) + (n2 > 2)
  if (any(temp)) {
    stop("NUM_ALLELES > 2 for some SNPs")
  }

  # Seperate the alleles
  nsnp     <- length(a1)
  mat1     <- matrix(data="", nrow=nsnp, ncol=2)
  mat2     <- matrix(data="", nrow=nsnp, ncol=2)
  mat1[,1] <- substr(a1, 1, 1)
  mat1[,2] <- substr(a1, 3, 3)
  mat2[,1] <- substr(a2, 1, 1)
  mat2[,2] <- substr(a2, 3, 3)
  rm(a1, a2)
  temp <- gc()

  # Create a temporary file for the allele re-naming
  tmp <- getTempfile(temp.list$dir, prefix="allele", ext=".txt")  

  # Open and initialize the file
  temp <- c("SNP", "OLD_ALLELES", "NEW_ALLELES")
  fid  <- writeVecToFile(temp, tmp, colnames=NULL, type=3, close=0,
                           sep="\t")
  
  reverse <- 2:1

  # Loop over each snp
  for (i in 1:nsnp) {
    n2i <- n2[i]
    n1i <- n1[i]
    if (!n2i) next

    m1 <- mat1[i, 1:n1i]
    m2 <- mat2[i, 1:n2i]

    # See if the alleles match
    flag <- (all(m2 %in% m1)) || (all(m1 %in% m2))

    # If they agree and there is no flip, then do nothing
    if (flag) {
      if (!flip2) next 

      # Alleles agree and there is a flip
      if (n2i == 2) {
        old <- paste(m2, collapse=",", sep="")
        new <- paste(m2[reverse], collapse=",", sep="")
      } else {
        old <- m2[1]
        if (n1i == 2) {
          # Get the other allele
          if (old == m1[1]) {
            new <- m1[2]
          } else {
            new <- m1[1]
          }
        } else if (n1i == 1) {
          new <- m1[1]
          if (new == old) next
        } else {
          # n1[i] is 0. Do nothing
          next
        }
      }
    } else {
      if (!n1i) next

      # Alleles do not match
      old <- paste(m2, collapse=",", sep="")
      new <- paste(m1, collapse=",", sep="")
      if (n1i == 1) {
        new <- m1[1]
        old <- m2[1]
      } else if (n2i == 1) {
        new <- m1[1]
      }
      if (old == new) next 
    }

    # Write to file
    temp <- c(snps[i], old, new)
    write(temp, file=fid, ncolumns=3, sep="\t")

  } 

  close(fid)

  # Rename the alleles
  temp <- glu.renameAlleles(snp2.list, tmp, outfile=outfile)

  # Delete temporary file
  if (temp.list$delete) file.remove(tmp)
    
  0

} # END: glu.mergeAlleles

# Function for calling ginfo
glu.ginfo <- function(snp.list, outloci="out.txt") {

  snp.list  <- check.snp.list(snp.list)
  type      <- snp.list$file.type
  if (type == 2) type <- "ldat"
  if (type == 3) type <- "sdat"
  
  snp     <- paste(snp.list$dir, snp.list$file, sep="")
  command <- paste("glu ginfo --outputloci=", outloci, sep="")
  command <- paste(command, " -f ", type, " ", snp, ":unique=n", sep="")
  ret     <- callOS(command)
  ret

} # END: glu.ginfo

# Function to get the common snps
glu.commonSNPs <- function(slist, temp.list=NULL, outfile=NULL) {

  # slist     List of sublists of type snp.list

  temp.list <- check.temp.list(temp.list)
  tmpfile   <- getTempfile(temp.list$dir, prefix="snp", ext=".txt") 
  tlist     <- list(file=tmpfile, file.type=3, delimiter="\t", header=1) 
  n         <- length(slist)
  
  for (i in 1:n) {
    temp <- glu.ginfo(slist[[i]], outloci=tmpfile)
    temp <- getColumns(tlist, "LOCUS", temp.list=temp.list)
    if (i == 1) {
      snps <- temp[[1]]
    } else {
      snps <- intersect(snps, temp[[1]])
    }
  }
  snps <- unique(snps)

  if (!is.null(outfile)) write(snps, file=outfile, ncolumns=1)

  snps

} # END: glu.commonSNPs

# Function to call qc.summary
glu.qcsummary <- function(snp.list, out.loci="out.txt", include.loci=NULL,
                  include.samples=NULL) {

  snp.list  <- check.snp.list(snp.list)
  
  snp     <- paste(snp.list$dir, snp.list$file, sep="")
  command <- paste("glu qc.summary --locusout=", out.loci, sep="")
  command <- paste(command, " ", snp, sep="")
  if (!is.null(include.loci)) {
    command <- paste(command, " --includeloci=", include.loci, sep="")
  }
  if (!is.null(include.samples)) {
    command <- paste(command, " --includesamples=", include.samples, sep="")
  }
  ret <- callOS(command)
  ret

} # END: glu.qcsummary

# Function to rename alleles
glu.renameAlleles <- function(snp.list, alleleFile, outfile="out.txt") {

  snp.list <- check.snp.list(snp.list)
  type     <- snp.list$file.type
  if (type == 2) type <- "ldat"
  if (type == 3) type <- "sdat"
  
  snp     <- paste(snp.list$dir, snp.list$file, sep="")
  command <- paste("glu transform -o ", outfile, sep="")
  command <- paste(command, " -f ", type, sep="")
  command <- paste(command, " --renamealleles=", alleleFile, sep="")
  command <- paste(command, " ", snp, sep="")
  ret     <- callOS(command)
  ret


} # END: glu.renameAlleles

# Function to compute the MAF in controls using GLU
getMAF.control <- function(snp.list, pheno.list, data, op=NULL, temp.list=NULL) {
 
  # Check the lists
  snp.list   <- check.snp.list(snp.list)
  pheno.list <- check.pheno.list(pheno.list)
  temp.list  <- check.temp.list(temp.list)

  # Get the subset of controls
  x <- read.table(pheno.list$file, header=pheno.list$header, sep=pheno.list$delimiter)

  temp <- getListName(pheno.list, "subsetData")
  if (!is.null(temp)) {
    x <- subsetData.list(x, temp)
  }

  vars <- c(pheno.list$id.var, pheno.list$response.var)
  x    <- removeOrKeepCols(x, vars, which=1)
  x    <- unfactor.all(x)

  # Get the controls 
  temp <- x[, pheno.list$response.var] == 0
  x    <- removeOrKeepRows(x, temp, which=1) 
  
  # Create a temporary file for the subject ids
  tmp <- getTempfile(temp.list$dir, prefix="samples", ext=".txt") 
  
  # Write the samples ids
  write(x[, pheno.list$id.var], file=tmp, ncolumns=1)

  # Create a temporary file for the snp ids
  tmp2 <- getTempfile(temp.list$dir, prefix="snps", ext=".txt") 
  
  # Write the samples ids
  write(data[, "SNP"], file=tmp2, ncolumns=1)

  rm(x, temp, vars)
  temp <- gc(verbose=FALSE)

  # Create a temporary file for GLU output
  gluout <- getTempfile(temp.list$dir, prefix="glu", ext=".txt") 

  # Call GLU
  temp <- glu.qcsummary(snp.list, out.loci=gluout, include.loci=tmp2,
                  include.samples=tmp)

  # Add the MAF column
  temp <- list(file=gluout, file.type=3, delimiter="\t", header=1, id.var="LOCUS", vars="MAF", names="MAF")
  data <- addColumn(data, "SNP", temp)

  # Delete files
  if (temp.list$delete) {
    file.remove(tmp)
    file.remove(gluout)
    file.remove(tmp2)
  }

  data

} # END: getMAF.control

# Function to get info on which snps belong to which study
glu.partition <- function(snp.list, pheno.list, temp.list=NULL, op=NULL) {

  # Check the lists
  op         <- default.list(op, c("byCC"), list(0), error=c(0))
  snp.list   <- check.snp.list(snp.list)
  pheno.list <- check.pheno.list(pheno.list)
  temp.list  <- check.temp.list(temp.list)
  pheno.list <- default.list(pheno.list, c("partition.var", "is.the.data"),
                  list("ERROR", 0), error=c(1, 0))
  idVal      <- temp.list$id
  
  if (temp.list$dir %in% c("", " ")) temp.list$dir <- "./"

  # Check if byCC option was specified
  byCC <- op$byCC
  if (byCC) {
    temp <- pheno.list[["cc.var", exact=TRUE]]
    if (is.null(temp)) {
      stop("ERROR: cc.var not specified in pheno.list")
    } 
  } else {
    ccLevels <- 1
  }
 
  pvar <- pheno.list$partition.var
  id   <- pheno.list$id.var
  x    <- readTable(pheno.list$file, pheno.list)
  pv   <- makeVector(unfactor(x[, pvar]))
  id   <- makeVector(unfactor(x[, id]))
  if (byCC) {
    cv <- makeVector(unfactor(x[, pheno.list$cc.var]))
    ccLevels <- unique(cv)
  }
  nr <- nrow(x)
  rm(x)
  gc()
 
  # Get the partition levels
  levels <- unique(pv)
  
  # Create a temporary file for the subject ids
  tmpfile <- getTempfile(temp.list$dir, prefix=paste("samples_", idVal, sep=""), ext=".txt") 

  # Create a temporary file for the qc output
  tmpqc <- getTempfile(temp.list$dir, prefix=paste("qcout_", idVal, sep=""), ext=".txt") 

  # GLU variables
  vars <- c("NUM_GENOTYPES", "GENOTYPE_COUNTS")

  # Loop over the levels
  x <- NULL
  for (lev in levels) {
    print(lev)
    temp0 <- (pv == lev)
    temp0[is.na(temp0)] <- FALSE
    
    for (cc in ccLevels) {

      if (byCC) {
        temp <- temp0 & (cv == cc)
        temp[is.na(temp)] <- FALSE
      } else {
        temp <- temp0
      }
      s <- sum(temp)
      print(s)
      if (!s) next
      
      # Write out the ids
      write(id[temp], file=tmpfile, ncolumns=1)

      # Call qc.summary
      temp <- glu.qcsummary(snp.list, out.loci=tmpqc, include.loci=NULL,
                  include.samples=tmpfile)

      # New variable names
      vname <- paste(pvar, ".", lev, sep="")
      new  <- c(vname, paste(vname, ".COUNTS", sep=""))
      if (byCC) new <- paste(new, ".", cc, sep="")

      # Add the info
      if (is.null(x)) {
        tlist <- list(what="character", returnMatrix=1, include.row1=1, delimiter="\t")
        x <- scanFile(tmpqc, tlist)
        x <- x[, c("LOCUS", "NUM_GENOTYPES", "GENOTYPE_COUNTS")]
        x <- data.frame(x)
        x <- unfactor.all(x)
        x <- renameVar(x, "LOCUS", "SNP")
        x <- renameVar(x, vars[1], new[1])
        x <- renameVar(x, vars[2], new[2])
        x[, new[1]] <- as.numeric(x[, new[1]])
        partVars <- new[1]
      } else {
        tlist <- list(file=tmpqc, file.type=3, header=1, delimiter="\t",
                      id.var="LOCUS", vars=vars, names=new, type=c("N", "C"))
        x <- addColumn(x, "SNP", tlist)
        partVars <- c(partVars, new[1])
      }

      # Delete files
      file.remove(tmpfile)
      file.remove(tmpqc)

    } # END: for (cc in ccLevels) 
  } # END: for (lev in levels

  # Determine which snps belonged to all studies
  v <- "ALL_PARTS"
  x[, v] <- 1
  for (var in partVars) x[, v] <- as.numeric(x[, v] & (x[, var] > 0)) 

  out <- op[["outfile", exact=TRUE]]
  if (!is.null(out)) writeTable(x, out)

  # Return the SNPs
  temp <- x[, v] == 1
  temp[is.na(temp)] <- FALSE
  snps <- makeVector(x[temp, "SNP"])

  snps

} # END: glu.partition 

# Function to get (study specific geno counts)
glu.getCounts <- function(snp.list, pheno.list, temp.list=NULL, op=NULL) {

  # op        List with names
  #  byCC     0 or 1
  #           The default is 1
  #  by.var   Name of variable on phenotype data for seperate counts
  #           The default is NULL
  #  snps     List of type file.list or a character vector of snps
  #           The default is NULL

  # Check lists
  snp.list   <- check.snp.list(snp.list)
  pheno.list <- check.pheno.list(pheno.list)
  temp.list  <- check.temp.list(temp.list)
  op         <- default.list(op, c("byCC"), list(1))
  
  # Get the by variable
  byVar <- getListName(op, "by.var")
  if (is.null(byVar)) {
    byFlag <- 0
  } else {
    byFlag <- 1
  }
  
  # Get the case-control variable
  ccVar <- getListName(pheno.list, "cc.var")
  if (is.null(ccVar)) op$byCC <- 0
  ccFlag <- op$byCC

  # Get the snps
  snpFlag <- 1
  snps <- getListName(op, "snps")
  if (is.null(snps)) snpFlag <- 0
  if (snpFlag) {
    if (typeof(snps) != "character") {
      snps <- scan(snps$file, what="character", sep="\n")
    }

    # Write these snps out to a temporary file
    tmpfile <- getTempfile(temp.list$dir, prefix="snps", ext=".txt") 
    write(snps, file=tmpfile, ncolumns=1)
    rm(snps)
    gc()
  }
  
  # Create temporary file for GLU output
  tmpglu <- getTempfile(temp.list$dir, prefix="gluOut", ext=".txt") 

  # Create string 
  str <- paste("glu transform -F ldat -o ", tmpglu, sep="")
  if (snpFlag) str <- paste(str, " --includeloci=", tmpfile, sep="")
  str <- paste(str, " ", snp.list$file, sep="")
  print(str)
  temp <- callOS(str)

  # Update snp.list
  snp.list$file        <- tmpglu
  snp.list$file.type   <- 2
  snp.list$stream      <- 0
  pheno.list$keep.vars <- c(pheno.list$id.var, ccVar, byVar)

  # Get the data
  tlist <- list(include.row1=0, include.snps=0, return.type=1, MAF=0,
                missing=1, snpNames=1, orderByPheno=1, return.pheno=1)
  temp  <- try(getData.1(snp.list, pheno.list, temp.list, op=tlist),
               silent=TRUE)
  if (class(temp) == "try-error") {
    print(temp)
    stop("ERROR loading data")
  }

  snpData   <- temp$data
  #missing   <- temp$missing
  snpNames  <- temp$snpNames
  delimiter <- "\t"
  nsnps     <- length(snpData)

  # Get the phenotype data
  phenoData.list <- temp$phenoData.list
  phenoData0     <- phenoData.list$data
  nr             <- nrow(phenoData0)

  if (ccFlag) cc <- makeVector(phenoData0[, ccVar])
  if (byFlag) by <- makeVector(phenoData0[, byVar])
  rm(temp, phenoData.list, phenoData0)
  gc()

  # Remove temporary files
  file.remove(tmpfile)
  file.remove(tmpglu)

  # Get the unique levels of ccVar and byVar
  if (ccFlag) {
    ccLevels <- unique(cc)
    ncc      <- length(ccLevels)
  } else {
    ncc      <- 1
  }
  if (byFlag) {
    byLevels <- unique(by)
    nby      <- length(byLevels)
  } else {
    nby      <- 1
  }

  # Initialize return data frame
  ret <- data.frame(snpNames)
  colnames(ret) <- "SNP"
  rm(snpNames)
  gc()

  # Loop over each level
  for (i in 1:nby) {
    if (byFlag) {
      print(byLevels[i])
      tempBy <- by == byLevels[i]
      tempBy[is.na(tempBy)] <- FALSE
    } else {
      tempBy <- rep(TRUE, times=nr)
    }
    for (j in 1:ncc) {
      if (byFlag) {
        print(ccLevels[j])
        tempCC <- cc == ccLevels[j]
        tempCC[is.na(tempCC)] <- FALSE
      } else {
        tempCC <- rep(TRUE, times=nr)
      }

      # Get the correct subset
      rows <- tempBy & tempCC
      
      # Get the new variable name
      if (byFlag) new <- byLevels[i]
      if (ccFlag) {
        new <- paste(new, ".", ccLevels[j], sep="")
      }
      ret[, new] <- NA 

      # Loop over each SNP
      for (k in 1:nsnps) {
        snp         <- as.integer(getVecFromStr(snpData[k], delimiter=delimiter))
        counts      <- getGenoCounts(snp[rows], exclude=c(NA, NaN), check=1) 
        ret[k, new] <- paste(counts, collapse="|", sep="")
      }
    }
  } # END: for (i in 1:nby)

  # Output file
  out <- getListName(op, "outfile")
  if (!is.null(out)) writeTable(ret, out)

  ret

} # END: glu.getCounts

# Function to get r2 and dprime.
# Subjects to use is determined by pheno.list$subsetData. Set pheno.list
#  to use all subjects in the genotype data.
# Use snp.list$snpNames to specify the snps
glu.r2 <- function(snp.list, pheno.list=NULL, op=NULL) {

  # op            List with names
  #  outfile

  # Check lists
  snp.list   <- check.snp.list(snp.list)
  if (!is.null(pheno.list)) {
    pheno.list <- check.pheno.list(pheno.list)
    pflag      <- 1
  } else {
    pflag      <- 0
  }
  op         <- default.list(op, c("outfile"), list("ERROR"), error=1)
  temp.list  <- getListName(op, "temp.list")
  temp.list  <- check.temp.list(temp.list)
  
  # Write out subject ids
  if (pflag) {
    temp  <- getPhenoData(pheno.list, temp.list=temp.list)$data
    tfile <- getTempfile(temp.list$dir, prefix="subs_r2_", ext=".txt")
    temp  <- makeVector(temp[, pheno.list$id.var])
    write(temp, file=tfile, ncolumns=1)
  }
 
  # Write out snps
  snps <- getListName(snp.list, "snpNames")
  sflag <- !is.null(snps)
  if (sflag) {
    snpfile <- getTempfile(temp.list$dir, prefix="SNP_r2_", ext=".txt")
    write(snp.list$snpNames, file=snpfile, ncolumns=1)
    snpStr <- paste(" --includeloci=", snpfile, sep="")
  } else {
    snpStr <- paste(" --includeloci=", snp.list$snpNames.list$file, sep="")
  }
  
  # Call GLU
  str <- "glu tagzilla --skipbinning -r 0"
  str <- paste(str, " --saveldpairs=", op$outfile, sep="")
  str <- paste(str, snpStr, sep="")
  if (pflag) str <- paste(str, " --includesamples=", tfile, sep="")
  str <- paste(str, " ", snp.list$file, sep="")
  callOS(str)
  
  # Delete file
  if (temp.list$delete) {
    if (sflag) file.remove(snpfile) 
    if (pflag) file.remove(tfile)
  }

  # Read in and edit the glu output
  x <- read.table(op$outfile, header=1, sep="\t", as.is=FALSE)
  x <- unfactor.all(x)
  temp <- x[, "LNAME1"] == x[, "LNAME2"]
  temp[is.na(temp)] <- FALSE
  x <- removeOrKeepRows(x, temp, which=-1)

  writeTable(x, op$outfile)

  x

} # END: glu.r2

# Function to merge phenotype and genotype data
glu.mergePhenoGeno <- function(snp.list, pheno.list, op=NULL) {

  # op          List of options for mergePhenoGeno and temp.list
  #  outfile
  #  which      0-2 for the coding

  snp.list   <- check.snp.list(snp.list)
  pheno.list <- check.pheno.list(pheno.list)
  temp.list  <- getListName(op, "temp.list")
  temp.list  <- check.temp.list(temp.list)
  snp.list   <- default.list(snp.list, "snpNames", list("ERROR"), error=1)

  # Temp file for ldat and snpNames
  tmpfile <- getTempfile(temp.list$dir, prefix="SNP", ext=".ldat")
  tmp     <- getTempfile(temp.list$dir, prefix="SNPNAMES", ext=".txt")
  temp    <- getListName(snp.list, "snpNames")
  write(temp, file=tmp, ncolumns=1)  

  str <- "glu transform -F ldat -o "
  str <- paste(str, tmpfile, sep="")
  str <- paste(str, " --includeloci=", tmp, sep="")
  str <- paste(str, " --orderloci=", tmp, sep="")
  str <- paste(str, " ", snp.list$file, sep="") 
  callOS(str)

  snp.list$file      <- tmpfile
  snp.list$file.type <- 2
  snp.list$delimiter <- "\t"
  x <- mergePhenoGeno(snp.list, pheno.list, temp.list=temp.list, op=op)

  if (temp.list$delete) {
    file.remove(tmpfile)
    file.remove(tmp)
  }

  x

} # END: glu.mergePhenoGeno

# Function to add missing SNPs in ld matrix
addMissingSNPs <- function(snps, mat) {

  n <- length(snps)
  nr <- nrow(mat)
  if (n == nr) return(mat)

  cnames <- colnames(mat)
  temp   <- !(snps %in% cnames)
  miss   <- snps[temp]
  nmiss  <- length(miss) 
  mat    <- as.matrix(mat)
  colnames(mat) <- NULL
  rownames(mat) <- NULL
  mat    <- rbind(mat, matrix(0, nrow=nmiss, ncol=nr))
  mat    <- cbind(mat, matrix(0, nrow=nrow(mat), ncol=nmiss))
  cnames <- c(cnames, miss)  
  rownames(mat) <- cnames
  colnames(mat) <- cnames
  diag(mat)     <- 1
  
  mat
}

# Function for returning an ld matrix
glu.ldMatrix <- function(snps, genoFile, op=NULL) {

  temp.list <- op[["temp.list", exact=TRUE]]
  temp.list <- check.temp.list(temp.list)
  op <- default.list(op, c("maxdist", "minmaf", "measure", "order", "addMiss"), 
                 list(200, 0.05, "r2", 1, 1))

  # Determine where it is running
  #host <- callOS("hostname", inter=TRUE)
  #if (host == "biowulf.nih.gov) {
  #  bioFlag <- 1
  #} else {
  #  bioFlag <- 0
  #}

  snps <- unique(snps)
  nsnps <- length(snps)

  if (nsnps == 1) {
    x <- matrix(data=1.0, nrow=1, ncol=1)
    rownames(x) <- snps
    colnames(x) <- snps
    return(x)
  }

  # Write snps to a temporary file
  tempfile <- getTempfile(temp.list$dir, prefix=paste("_tmplist_", temp.list$id, "_", sep=""), ext=".txt")
  write(snps, file=tempfile, ncolumns=1)

  # Temporary output file
  tempout <- getTempfile(temp.list$dir, prefix=paste("_tmpout_", temp.list$id, "_", sep=""), ext=".txt")
  
  # Create the glu call command
  str <- paste("glu ld.matrix --maxdist=", op$maxdist, " --minmaf=", op$minmaf, 
               " --measure=", op$measure, " --includeloci=", tempfile,
               " --output=", tempout, sep="")
  # Loci description file
  #temp <- op[["loci.file", exact=TRUE]]
  #if ((is.null(temp)) && ())
 
  str <- paste(str, " ", genoFile, sep="")
  callOS(str)
 
  # Read in the matrix
  x <- read.table(tempout, as.is=TRUE, header=1, row.names=1, sep="\t")
  n <- ncol(x)
  for (i in 1:(n-1)) {
    cols <- (i+1):n
    x[i, cols] <- x[cols, i]
  }
  temp <- is.na(x)
  x[temp] <- 0

  # Delete temp files
  if (temp.list$delete) {
    file.remove(tempfile)
    file.remove(tempout)
  }

  # Add missing snps if needed
  if (op$addMiss) x <- addMissingSNPs(snps, x)

  # Check for error
  if (nsnps != nrow(x)) stop("ERROR in glu.ldMatrix: incorrect dimension of LD matrix")

  # Order the same as the input list of snps
  if (op$order) x <- x[snps, snps]
  
  x

} # glu.ldMatrix

# Function for getting the snps in a list of genes
glu.snps_in_genes <- function(gene.list, op=NULL) {

  keep <- c("LOCUS",  "CHROMOSOME",  "LOCATION", "FEATURE_NAME", "FEATURE_TYPE")

  op <- default.list(op, c("d", "u", "keep.vars"), list(70000, 70000, keep))
  temp.list <- op[["temp.list", exact=TRUE]]
  temp.list <- check.temp.list(temp.list)
  dir <- temp.list$dir  
  id <- temp.list$id

  genoFile <- op[["geno.file", exact=TRUE]]
  genoFlag <- !is.null(genoFile)

  tmpfile <- getTempfile(dir, paste("genedb_", id, sep=""), ext=".txt")
  tmpgene <- getTempfile(dir, paste("genes_", id, sep=""), ext=".txt")
  write(gene.list, file=tmpgene, ncolumns=1)

  str <- paste("glu genedb.find_snps -u ", op$u, " -d ", op$d, 
               " -o ", tmpfile, " ", tmpgene, sep="")
  print(str)
  callOS(str)

  x <- loadData.table(tmpfile)
  keep <- op[["keep.vars", exact=TRUE]]
  if (!is.null(keep)) x <- x[, keep]

  if (genoFlag) {
    str <- paste("glu ginfo --outputloci=", tmpfile, ":c=1 ", genoFile, sep="")
    print(str)
    callOS(str)
    snps <- scan(tmpfile, what="character", sep="\n")
    temp <- x[, "LOCUS"] %in% snps
    x <- x[temp, ]
    rm(snps)
    gc()
  }

  if (temp.list$delete) {
    file.remove(tmpfile)
    file.remove(tmpgene)
  }

  temp <- op[["out.file", exact=TRUE]]
  if (!is.null(temp)) writeTable(x, temp)
  
  x

} # END: glu.snps_in_genes

# Function for returning the number of bins
glu.nBins <- function(snps, genoFile, op=NULL) {

  op <- default.list(op, c("r2.threshold", "min.MAF", "temp.list", "return.data", "snpsIsFile", "samplesIsFile"), 
                    list(0.8, 0.05, list(), 0, 0, 0))
  temp.list <- check.temp.list(op$temp.list)

  snpFlag <- op$snpsIsFile
  if (!snpFlag) {
    snps <- unique(snps)
    nsnps <- length(snps)
    if (nsnps == 1) return(1)

    # Write snps to a temporary file
    tempfile <- getTempfile(temp.list$dir, prefix=paste("_tmplist_", temp.list$id, "_", sep=""), ext=".txt")
    write(snps, file=tempfile, ncolumns=1)
  } else {
    tempfile <- snps
  }

  samples  <- op[["samples", exact=TRUE]]
  sflag    <- !is.null(samples)
  sampFlag <- op$samplesIsFile
  if (sflag) {
    if (!sampFlag) {
      # Write samples to a temporary file
      tempsamp <- getTempfile(temp.list$dir, prefix=paste("_tmpsamp_", temp.list$id, "_", sep=""), ext=".txt")
      write(samples, file=tempsamp, ncolumns=1)
    } else {
      tempsamp <- samples
    }
  }

  # Temporary output file
  tempout <- getTempfile(temp.list$dir, prefix=paste("_tmpout_", temp.list$id, "_", sep=""), ext=".txt")
  
  # Create the glu call command
  str <- paste("glu ld.tagzilla -r ", op$r2.threshold, " -a ", op$min.MAF, 
               " --includeloci=", tempfile,
               " -O ", tempout, sep="")
  if (sflag) str <- paste(str, " --includesamples=", tempsamp, sep="")
  str <- paste(str, " ", genoFile, sep="")
  callOS(str)
 
  # Read in the matrix
  x <- read.table(tempout, as.is=TRUE, header=1, sep="\t")
  n <- max(as.numeric(x[, "BINNUM"]), na.rm=TRUE)
  
  # Delete temp files
  if (temp.list$delete) {
    if (!snpFlag) file.remove(tempfile)
    file.remove(tempout)
    if ((sflag) && (!sampFlag)) file.remove(tempsamp)
  }

  if (op$return.data) return(x)

  n

} # END: glu.nBins

# Function for returning the snps in a set of bins
glu.SNP_bins <- function(snps, genoFile, op=NULL) {

  # op    
  #  samples        Character vector of sample ids

  op <- default.list(op, c("return.data"), list(1))

  x <- glu.nBins(snps, genoFile, op=op) 

  # LNAME         LOCATION    MAF           BINNUM      DISPOSITION

  # For each bin, choose the snp with the largest MAF
  bin <- as.numeric(makeVector(x[, "BINNUM"]))
  snp <- makeVector(x[, "LNAME"])  
  maf <- as.numeric(makeVector(x[, "MAF"]))
  rm(x)
  gc()

  ubins <- unique(bin)
  nbins <- length(ubins)
  ret   <- character(nbins)
  for (i in 1:nbins) {
    temp <- bin == ubins[i]
    if (sum(temp) == 1) {
      ret[i] <- snp[temp]
    } else {
      j <- which.max(maf[temp])
      ret[i] <- (snp[temp])[j]   
    }
  }

  ret

} # END: glu.SNP_bins

# Function to convert to a ped file
glu.create_ped <- function(snp.list, pheno.list, op=NULL, info.list=NULL) {
 
  # snp.list        File must be a format GLU can read
  # pheno.list      With names response.var, gender.var, male, female
  # op       
  #   map.type      0=file from GLU, 1=format for haploview
  #   GLU           path to GLU
  # info.list       List with snp info (location) for map file
  #   id.var        SNP column
  #   loc.var


  snp.list <- check.snp.list(snp.list)
  pheno.list <- check.pheno.list(pheno.list)
  pheno.list <- default.list(pheno.list, 
                c("response.var", "gender.var", "male", "female"), 
                list("ERROR", "ERROR", "MALE", "FEMALE"), 
                error=c(1, 1, 0, 0))
  op <- default.list(op, c("temp.list", "map.type", "GLU"), list(list(), 0, "glu"))
  temp.list <- check.temp.list(op$temp.list)
  GLU       <- op$GLU

  x         <- loadData.table(pheno.list)
  print(dim(x))
  vars      <- c(pheno.list$id.var, pheno.list$response.var, pheno.list$gender.var)
  x         <- removeOrKeepCols(x, vars)
  subids    <- makeVector(x[, pheno.list$id.var])
  gender0   <- makeVector(x[, pheno.list$gender.var])
  response0 <- makeVector(x[, pheno.list$response.var])
  gender    <- integer(length(gender0))
  temp      <- gender0 %in% pheno.list$male
  gender[temp] <- 1
  temp      <- gender0 %in% pheno.list$female
  gender[temp] <- 2
  response  <- integer(length(response0))
  temp     <- response0 == 0
  temp[is.na(temp)] <- FALSE
  response[temp] <- 1
  temp     <- response0 == 1
  temp[is.na(temp)] <- FALSE
  response[temp] <- 2

  rm(x, gender0, response0)
  gc()

  # See if snps are in a file already
  snp.file <- op[["snp.file", exact=TRUE]]
  snpFlag  <- !is.null(snp.file) 

  # Create file for sample ids
  dir <- temp.list$dir
  id  <- temp.list$id
  tmpids <- paste(dir, "ids_", id, ".txt", sep="")
  write(subids, file=tmpids, ncolumns=1)

  # Temporary output file
  tmpfile <- paste(dir, "out_", id, ".ped", sep="")

  str <- paste(GLU, " transform -F ped -o ", tmpfile, sep="")
  if (snpFlag) str <- paste(str, " --includeloci=", snp.file, sep="")
  str <- paste(str, " --includesamples=", tmpids, sep="")
  str <- paste(str, " ", snp.list$file, sep="")
  print(str)
  callOS(str)

  # File has columns: Family id, subject id, father id, mother id, gender, phenotype, genotypes
  ped <- scan(tmpfile, what="character", sep="\n")
  n   <- length(ped)
  
  for (i in 1:n) {
    x      <- getVecFromStr(ped[i], delimiter=" ")
    row    <- match(x[2], subids)
    x[5:6] <- c(gender[row], response[row]) 
    ped[i] <- paste(x, collapse=" ", sep="")  
  }

  out <- op[["out.ped", exact=TRUE]]
  if (!is.null(out)) write(ped, file=out, ncolumns=1)
  rm(ped)
  gc() 

  out <- op[["out.map", exact=TRUE]]
  if (!is.null(out)) {
    # Get the map file name
    len <- nchar(tmpfile)
    tmpmap <- substr(tmpfile, 1, len-4)
    tmpmap <- paste(tmpmap, ".map", sep="")
    file.copy(tmpmap, out, overwrite=TRUE)
  }

  # Read the map file
  x <- matrix(scan(out, what="character", sep=" "), byrow=TRUE, ncol=4)

  # Add locations
  if (!is.null(info.list)) {
    info.list <- default.list(info.list, c("id.var", "loc.var"), list("ERROR", "ERROR"), error=c(1, 1))
    info.list <- check.file.list(info.list)
    id.var    <- info.list$id.var
    y <- loadData.table(info.list)
    temp <- x[, 2] %in% y[, id.var]
    ids  <- match(x[, 2], y[, id.var])
    ids  <- ids[!is.na(ids)]
    x[temp, 4] <- y[ids, info.list$loc.var]
  }

  if (op$map.type) {
    x <- x[, c(2, 4)]
  }

  write.table(x, file=out, sep=" ", row.names=FALSE, quote=FALSE, col.names=FALSE)

  if (temp.list$delete) {
    file.remove(tmpids)
    file.remove(tmpfile)
    file.remove(tmpmap)
  }

}

glu.LD.snps <- function(snps.file, geno.file, out.dir, qc.file, subs.file=NULL, GLU="GLU", 
                 id.str="", delta=200000, delete=1, snp.var="LOCUS", chr.var="CHROMOSOME", loc.var="LOCATION") {

  tmp.snps <- paste(out.dir, "tmp.snps", id.str, sep="")
  tmp.glu  <- paste(out.dir, "tmp.glu", id.str, sep="")
  out.file <- paste(out.dir, "GLU.LD.snps", id.str, ".txt.xls", sep="")
  out.max  <- paste(out.dir, "GLU.R2.max", id.str, ".txt.xls", sep="")


  # Get the complete set of snps
  x    <- loadData.table(qc.file)
  keep <- c(snp.var, loc.var, chr.var)
  x    <- x[, keep]
  
  all  <- NULL
  snps <- scan(snps.file, what="character")
  temp <- x[, snp.var] %in% snps
  y    <- x[temp, , drop=FALSE]
  LOC  <- as.numeric(x[, loc.var])
  CHR  <- x[, chr.var]

  for (i in 1:nrow(y)) {
    snp  <- y[i, snp.var]
    chr  <- y[i, chr.var]
    loc  <- as.numeric(y[i, loc.var])
    temp <- (CHR %in% chr) & (LOC >= loc - delta) & (LOC <= loc + delta)
    temp[is.na(temp)] <- FALSE
    n <- sum(temp)
    print(c(snp, chr, loc, n))
    if (!n) next
    all <- unique(c(all, x[temp, snp.var]))
  }
  n <- length(all)
  if (!n) stop("ERROR: no SNPs")
  write(all, file=tmp.snps, ncolumns=1)

  # Call GLU
  str <- paste(GLU, " tagzilla --skipbinning -r 0 -m 9999 -a 0.04", sep="")
  str <- paste(str, " --saveldpairs=", tmp.glu, sep="")
  str <- paste(str, " --includeloci=", tmp.snps, sep="")
  if (!is.null(subs.file)) str <- paste(str, " --includesamples=", subs.file, sep="")
  str <- paste(str, " ", geno.file, sep="")
  print(str)
  callOS(str)
  
  # Read in and edit the glu output
  x    <- read.table(tmp.glu, header=1, sep="\t", as.is=TRUE)
  temp <- ((x[, "LNAME1"] %in% snps) | (x[, "LNAME2"] %in% snps)) & (x[, "LNAME1"] != x[, "LNAME2"])
  temp[is.na(temp)] <- FALSE
  x    <- x[temp, , drop=FALSE]

  # Put snps in column1
  temp <- (x[, "LNAME2"] %in% snps)
  if (any(temp)) {
    tsnp <- x[temp, "LNAME1"]
    x[temp, "LNAME1"] <- x[temp, "LNAME2"]
    x[temp, "LNAME2"] <- tsnp
  }
  x <- sort2D(x, "RSQUARED", fun=as.numeric, dec=TRUE)
  x <- sort2D(x, "LNAME1")
 
  writeTable(x, out.file)

  # Maximum R^2 for each snp, x is sorted
  dup <- duplicated(x[, "LNAME1"])
  y   <- x[!dup, , drop=FALSE]
  writeTable(y, out.max)
  

  # Delete file
  if (delete) {
    file.remove(tmp.snps) 
    file.remove(tmp.glu)
  }

  x

} # END: glu.LD.snps

# Function to extract snps
glu.extract <- function(chr.vec, start.vec, end.vec, geno.file, out.dir, qc.file, subs.file=NULL, GLU="GLU", out.format="sdat", gzip=1,
                 id.str="", delta=20000, delete=1, snp.var="LOCUS", chr.var="CHROMOSOME", loc.var="LOCATION") {

  tmp.snps <- paste(out.dir, "tmp.snps", id.str, sep="")
  out.file <- paste(out.dir, "snps", id.str, ".", out.format, sep="")

  # Get the complete set of snps
  x    <- loadData.table(qc.file)
  keep <- c(snp.var, loc.var, chr.var)
  x    <- x[, keep]
  temp <- x[, chr.var] %in% chr.vec
  x    <- x[temp, ]
  
  all  <- NULL
  LOC  <- as.numeric(x[, loc.var])
  CHR  <- x[, chr.var]
  print(x[1:10, ])
#print(sort(LOC))
  for (i in 1:length(chr.vec)) {
    chr  <- chr.vec[i]
    a    <- start.vec[i]
    b    <- end.vec[i] 
    temp <- (CHR %in% chr) & (LOC >= a - delta) & (LOC <= b + delta)
    temp[is.na(temp)] <- FALSE
    n <- sum(temp)
    print(c(chr, a, b, n))
    if (!n) next
    all <- unique(c(all, x[temp, snp.var]))
  }
  n <- length(all)
  if (!n) stop("ERROR: no SNPs")
  write(all, file=tmp.snps, ncolumns=1)

  # Call GLU
  str <- paste(GLU, " transform -F ", out.format, sep="")
  str <- paste(str, " --includeloci=", tmp.snps, sep="")
  if (!is.null(subs.file)) str <- paste(str, " --includesamples=", subs.file, sep="")
  str <- paste(str, " -o ", out.file, " ",  geno.file, sep="")
  print(str)
  callOS(str)
  
  if (gzip) {
    str <- paste("gzip ",  out.file, sep="")
    print(str)
    callOS(str)
  }

  # Delete file
  if (delete) {
    file.remove(tmp.snps) 
  }

  NULL

} # END: glu.extract

Try the CGEN package in your browser

Any scripts or data that you put into this service are public.

CGEN documentation built on April 28, 2020, 8:08 p.m.