Nothing
#' Fit Cox survival to all variants from PLINK binary files (.BED, .BIM, .FAM)
#'
#' Performs survival analysis using Cox proportional hazard models on directly
#' typed data in PLINK format
#'
#' @param bed.file character of name of plink files without extension
#' @param covariate.file data.frame comprising phenotype information, all
#' covariates to be added in the model must be numeric.
#' @param id.column character giving the name of the ID column
#' in covariate.file.
#' @param sample.ids character vector of sample IDs to keep in
#' survival analysis
#' @param time.to.event character of column name in covariate.file that
#' represents the time interval of interest in the analysis
#' @param event character of column name in covariate.file that represents
#' the event of interest to be included in the analysis
#' @param covariates character vector with exact names of columns in
#' covariate.file to include in analysis
#' @param inter.term character string giving the column name of the covariate
#' that will be added to the interaction term with
#' SNP (e.g. \code{term*SNP}). See details.
#' @param print.covs character string of either \code{"only"}, \code{"all"}
#' or \code{"some"}, defining which covariate statistics should be printed to
#' the output. See details.
#' @param out.file character of output file name (do not include extension)
#' @param chunk.size integer number of variants to process per thread
#' @param maf.filter numeric to filter minor allele frequency
#' (i.e. choosing 0.05 means filtering MAF>0.05). User can set this to
#' \code{NULL} if no filtering is preffered. Default is 0.05.
#' @param flip.dosage logical to flip which allele the dosage was
#' calculated on, default \code{flip.dosage=TRUE}
#' @param verbose logical for messages that describe which part of the
#' analysis is currently being run
#' @param clusterObj A cluster object that can be used with the
#' \code{parApply} function. See details.
#'
#' @details
#' Testing for SNP-covariate interactions:
#' User can define the column name of the covariate that will be included in
#' the interaction term.
#' For example, for given covariates \code{a} and \code{b}, where \code{c} is
#' defined as the \code{inter.term} the model will be:
#' \code{~ a + b + c + SNP + c*SNP}.
#'
#' Printing results of other covariates:
#' \code{print.covs} argument controls the number of covariates will be printed
#' as output. The function is set to \code{only}
#' by default and will only print the SNP or if an interaction term is given,
#' the results of the interaction
#' term (e.g. \code{SNP*covariate}). Whereas, \code{all} will print results
#' (coef, se.coef, p.value etc) of all covariates included in the model.
#' \code{some} is only applicable if an interaction term is given and will
#' print the results for SNP, covariate tested for interaction and the
#' interaction term. User should be mindful about using the \code{all} option,
#' as it will likely slow down the analysis and will increase
#' the output file size.
#'
#' User defined parallelization:
#' This function uses \code{parApply} from \code{parallel} package to fit
#' models to SNPs in parallel.
#' User is not required to set any options for the parallelization.
#' However, advanced users who wish to optimize it, can provide a
#' cluster object generated by \code{makeCluster} family of functions that
#' suits their need and platform.
#'
#' @return
#' Saves two text files directly to disk:
#' \code{.coxph} extension containing CoxPH survival analysis results.
#' \code{.snps_removed} extension containing SNPs that were removed
#' due to low variance or user-defined thresholds.
#'
#' @examples
#' bed.file <- system.file(package="gwasurvivr",
#' "extdata",
#' "plink_example.bed")
#' covariate.file <- system.file(package="gwasurvivr",
#' "extdata",
#' "simulated_pheno.txt")
#' covariate.file <- read.table(covariate.file,
#' sep=" ",
#' header=TRUE,
#' stringsAsFactors = FALSE)
#' covariate.file$SexFemale <- ifelse(covariate.file$sex=="female", 1L, 0L)
#' sample.ids <- covariate.file[covariate.file$group=="experimental",]$ID_2
#' plinkCoxSurv(bed.file=bed.file,
#' covariate.file=covariate.file,
#' id.column="ID_2",
#' sample.ids=sample.ids,
#' time.to.event="time",
#' event="event",
#' covariates=c("age", "SexFemale", "DrugTxYes"),
#' inter.term=NULL,
#' print.covs="only",
#' out.file="impute_example",
#' chunk.size=50,
#' maf.filter=0.005,
#' flip.dosage=TRUE,
#' verbose=TRUE,
#' clusterObj=NULL)
#'
#' @importFrom survival Surv coxph.fit
#' @importFrom matrixStats rowMeans2 rowVars rowSds
#' @importFrom SummarizedExperiment rowRanges
#' @importFrom utils write.table
#' @importFrom stats pnorm rnorm setNames complete.cases
#' @importFrom SNPRelate snpgdsBED2GDS
#' @import parallel
#' @import GWASTools
#'
#' @export
plinkCoxSurv <- function(bed.file,
covariate.file,
id.column,
sample.ids=NULL,
time.to.event,
event,
covariates,
inter.term=NULL,
print.covs="only",
out.file,
chunk.size=10000,
maf.filter=0.005,
flip.dosage=TRUE,
verbose=TRUE,
clusterObj=NULL)
{
bed.file <- bed.file
bim.file <- sub("\\.[^.]*?$", ".bim", bed.file)
fam.file <- sub("\\.[^.]*?$", ".fam", bed.file)
###################################
#### Phenotype data wrangling #####
cox.params <- coxPheno(covariate.file,
covariates,
id.column,
inter.term,
time.to.event,
event,
sample.ids,
verbose)
###################################
###################################
##### Generate cluster obj ########
# create cluster object depending on
# user pref or OS type, also create
#option to input number of cores
if(!is.null(clusterObj)){
cl <- clusterObj
}else if(.Platform$OS.type == "unix") {
cl <- makePSOCKcluster(getOption("gwasurvivr.cores", 2L))
} else {
cl <- makeCluster(getOption("gwasurvivr.cores", 2L))
}
on.exit(stopCluster(cl), add=TRUE)
gdsfile <- tempfile(pattern="", fileext = ".gds")
comp_time <- system.time(snpgdsBED2GDS(bed.file,
fam.file,
bim.file,
gdsfile,
cvt.chr="int",
cvt.snpid="int",
verbose=TRUE)
)
message("***** Compression time ******\n",
"User:", round(comp_time[1], 3),
"\nSystem: ", round(comp_time[2], 3),
"\nElapsed: ", round(comp_time[3], 3),
"\n*****************************"
)
gds <- GdsGenotypeReader(gdsfile,
YchromCode=24L,
XYchromCode=25L)
scanID <- getScanID(gds)
scanAnnot <- ScanAnnotationDataFrame(data.frame(scanID,
stringsAsFactors=FALSE))
snpID <- getSnpID(gds)
chromosome <- getChromosome(gds)
position <- getPosition(gds)
alleleA <- getAlleleA(gds)
alleleB <- getAlleleB(gds)
rsID <- getVariable(gds, "snp.rs.id")
# requires snpID
snpAnnot <- SnpAnnotationDataFrame(data.frame(snpID,
rsID,
chromosome,
position,
alleleA,
alleleB,
stringsAsFactors=FALSE),
YchromCode=24L,
XYchromCode=25L)
genoData <- GenotypeData(gds,
scanAnnot=scanAnnot,
snpAnnot=snpAnnot)
# set up columns for output
cols <- c("RSID",
"CHR",
"POS",
"A0",
"A1",
"exp_freq_A1",
"SAMP_MAF")
write.table( t(cols),
paste0(out.file, ".snps_removed"),
row.names = FALSE,
col.names=FALSE,
sep="\t",
quote = FALSE,
append = FALSE)
snp.df <- data.frame(t(rep(NA, 7)))
colnames(snp.df) <- cols
rownames(snp.df) <- NULL
snp.spike <- rbind(rnorm(nrow(cox.params$pheno.file)),
rnorm(nrow(cox.params$pheno.file)))
if(!is.null(inter.term)){
cox.out <- t(apply(snp.spike,
1,
survFitInt,
cox.params=cox.params,
cov.interaction=inter.term,
print.covs=print.covs)
)
res.cols <- colnames(coxExtract(cox.out,
snp.df,
print.covs=print.covs)
)
write.table( t(res.cols),
paste0(out.file, ".coxph"),
row.names = FALSE,
col.names=FALSE,
sep="\t",
quote = FALSE,
append = FALSE)
} else {
cox.out <- t(apply(snp.spike,
1,
survFit,
cox.params=cox.params,
print.covs=print.covs)
)
res.cols <- colnames(coxExtract(cox.out,
snp.df,
print.covs=print.covs)
)
write.table(t(res.cols),
paste0(out.file, ".coxph"),
row.names = FALSE,
col.names=FALSE,
sep="\t",
quote = FALSE,
append = FALSE)
}
# number of dropped snps
snp.drop.n <-0
snp.n <- 0
snp.start <- 1
snp.end <- nsnp(genoData)
# get genotypes for certain chunk size
nsnp.seg <- snp.end - snp.start + 1
nchunks <- ceiling(nsnp.seg/chunk.size)
for(i in seq_len(nchunks)){
if(verbose) message("Analyzing part ", i, "/", nchunks, "...")
# set up chunks
next.chunk <- (i-1)*chunk.size
next.chunk.start <- snp.start + next.chunk
snp.chunk <- ifelse(next.chunk.start + chunk.size > snp.end,
snp.end - next.chunk.start + 1,
chunk.size)
chunk.idx <- (next.chunk+1):(next.chunk+snp.chunk)
# get genotypes for chunk
genotypes <- getGenotype(genoData,
snp=c(next.chunk.start, snp.chunk),
scan=c(1,-1),
drop=FALSE)
# get the snp info file
snp <- getAnnotation(getSnpAnnotation(genoData))[chunk.idx,]
snp.cols <- c("snpID",
"RSID",
"CHR",
"POS",
"A0",
"A1")
snp.ord <- c("RSID",
"CHR",
"POS",
"A0",
"A1")
colnames(snp) <- snp.cols
snp <- snp[, snp.ord]
blankSNPs <- snp$A0 == "0" & snp$A1 == "0"
# grab sample file data
scanAnn <- getAnnotation(getScanAnnotation(genoData))
# assign rsIDs (pasted with imputation status) as rows
# and sample ID as columns to genotype file
dimnames(genotypes) <- list(snp$RSID,
scanAnn$scanID)
# Subset genotypes by given samples
genotypes <- genotypes[!blankSNPs,cox.params$ids]
snp <- snp[!blankSNPs,]
# flip dosage
if(flip.dosage) genotypes <- 2 - genotypes
########################################################################
########################################################################
##### SNP info and filtering ###########################################
### Check snps for MAF = 0 ###
# remove snps with SD less than 1e-4
# to put this in perspective:
# a sample size of 100 000 000 with only 1 person being 1 and rest 0,
# has an SD = 1e-4
# x <- c(rep(0, 1e8),1)
# sd(x)
ok.snp <- rowSds(genotypes, na.rm = TRUE) > 1e-4
snp <- snp[ok.snp, ]
genotypes <- genotypes[ok.snp, ]
snp.drop <- snp[!ok.snp, ]
# calculate MAF
snp$exp_freq_A1 <- round(rowMeans2(genotypes, na.rm = TRUE)*0.5,4)
snp$SAMP_MAF <- ifelse(snp$exp_freq_A1 > 0.5,
1-snp$exp_freq_A1,
snp$exp_freq_A1
)
# Further filter by user defined thresholds
if (!is.null(maf.filter)) {
ok.snp <- snp$SAMP_MAF > maf.filter
genotypes <- genotypes[ok.snp,]
snp <- snp[ok.snp,]
if(nrow(snp.drop) > 0){
snp.drop$exp_freq_A1 <- 1
snp.drop$SAMP_MAF <- 0
snp.drop <- rbind(snp.drop, snp[!ok.snp,])
} else {
snp.drop <- snp[!ok.snp,]
}
}
if (nrow(snp.drop) > 0) {
write.table(
snp.drop,
paste0(out.file, ".snps_removed"),
row.names = FALSE,
col.names = FALSE,
sep = "\t",
quote = FALSE,
append = TRUE )
snp.drop.n <- snp.drop.n+nrow(snp.drop)
}
if (nrow(genotypes) > 0) {
# fit models in parallel
if (is.null(inter.term)) {
if (is.matrix(genotypes)) {
cox.out <- t(
parallel::parApply(
cl = cl,
1L,
X = genotypes,
FUN = survFit,
cox.params = cox.params,
print.covs = print.covs
)
)
} else if(is.numeric(genotypes)) {
cox.out <- survFit(genotypes,
cox.params = cox.params,
print.covs = print.covs)
}
} else if (inter.term %in% covariates) {
if (is.matrix(genotypes)) {
cox.out <- t(
parApply(
cl = cl,
X = genotypes,
MARGIN = 1,
FUN = survFitInt,
cox.params = cox.params,
cov.interaction = inter.term,
print.covs = print.covs
)
)
} else if(is.numeric(genotypes)) {
cox.out <- survFitInt(
genotypes,
cox.params = cox.params,
cov.interaction = inter.term,
print.covs = print.covs
)
}
}
res <- coxExtract(cox.out,
snp,
print.covs)
write.table(
res,
file = paste0(out.file, ".coxph"),
sep = "\t",
quote = FALSE,
row.names = FALSE,
col.names = FALSE,
append = TRUE
)
snp.n <- nrow(genotypes) + snp.n
}
}
if(verbose) {
message(snp.drop.n," SNPs were removed from the analysis for not meeting\n",
"the given threshold criteria or for having MAF = 0")
}
if(verbose) message("List of removed SNPs are saved to ",
paste0(out.file, ".snps_removed"))
if(verbose) message("In total ", snp.n,
" SNPs were included in the analysis")
if(verbose) message("The Cox model results output was saved to ",
paste0(out.file, ".coxph"))
if (verbose) message("Analysis completed on ",
format(Sys.time(), "%Y-%m-%d"),
" at ",
format(Sys.time(), "%H:%M:%S"))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.