.probToDosage <- function(probs, BB=TRUE, prob.miss.val=NULL) {
if (BB & ncol(probs) %% 3 != 0) stop("invalid probability file - there are not 3 columns per row")
if (!BB & ncol(probs) %% 2 != 0) stop("invalid probability file - there are not 2 columns per row")
## check for missing values here while it's still character strings
## if AA==AB==BB, set to missing
if (BB) {
AAprob <- probs[,c(TRUE,FALSE,FALSE),drop=FALSE]
ABprob <- probs[,c(FALSE,TRUE,FALSE),drop=FALSE]
BBprob <- probs[,c(FALSE,FALSE,TRUE),drop=FALSE]
# check for missing strings
## calculate A allele dosage
i <- AAprob == ABprob & AAprob == BBprob
mode(AAprob) <- "numeric"
mode(ABprob) <- "numeric"
mode(BBprob) <- "numeric"
dosage <- (2*AAprob + ABprob) / (AAprob + ABprob + BBprob)
dosage[i] <- NA
} else {
AAprob <- probs[,c(TRUE,FALSE),drop=FALSE]
ABprob <- probs[,c(FALSE,TRUE),drop=FALSE]
# check for missing strings
## calculate A allele dosage
# assumes no missing, or at least, a dosage that doesn't give you -1.
# no normalization either, since we don't have the full set of probabilities
mode(AAprob) <- "numeric"
mode(ABprob) <- "numeric"
dosage <- 2*AAprob + ABprob
}
return(dosage)
}
.probToGenotype <- function(probs, BB=TRUE, prob.threshold=0.9) {
if (BB & ncol(probs) %% 3 != 0) stop("invalid probability file - there are not 3 columns per row")
if (!BB & ncol(probs) %% 2 != 0) stop("invalid probability file - there are not 2 columns per row")
## check for missing values here while it's still character strings
## if AA==AB==BB, set to missing
if (BB) {
AAprob <- probs[,c(TRUE,FALSE,FALSE),drop=FALSE]
ABprob <- probs[,c(FALSE,TRUE,FALSE),drop=FALSE]
BBprob <- probs[,c(FALSE,FALSE,TRUE),drop=FALSE]
# check for missing strings
## calculate A allele dosage
i <- AAprob == ABprob & AAprob == BBprob
mode(AAprob) <- "numeric"
mode(ABprob) <- "numeric"
mode(BBprob) <- "numeric"
} else {
AAprob <- probs[,c(TRUE,FALSE),drop=FALSE]
ABprob <- probs[,c(FALSE,TRUE),drop=FALSE]
# check for missing strings
## calculate A allele dosage
# assumes no missing, or at least, a dosage that doesn't give you -1.
# no normalization either, since we don't have the full set of probabilities
mode(AAprob) <- "numeric"
mode(ABprob) <- "numeric"
BBprob <- 1 - AAprob - ABprob
}
sel.AA <- AAprob > ABprob & AAprob > BBprob & AAprob > prob.threshold
sel.AB <- ABprob > AAprob & ABprob > BBprob & ABprob > prob.threshold
sel.BB <- BBprob > AAprob & BBprob > ABprob & BBprob > prob.threshold
genotype <- matrix(NA, ncol=ncol(AAprob), nrow=nrow(AAprob))
genotype[sel.AA] <- 2
genotype[sel.AB] <- 1
genotype[sel.BB] <- 0
return(genotype)
}
imputedDosageFile <- function(input.files, filename, chromosome,
input.type=c("IMPUTE2", "BEAGLE", "MaCH"),
input.dosage=FALSE,
output.type=c("dosage", "genotype"),
file.type=c("gds", "ncdf"),
snp.annot.filename="dosage.snp.RData",
scan.annot.filename="dosage.scan.RData",
compress="LZMA_RA:1M",
compress.annot="LZMA_RA",
genotypeDim="scan,snp",
scan.df=NULL,
snp.exclude=NULL,
snp.id.start=1,
block.size=5000,
prob.threshold=0.9,
verbose=TRUE) {
# arguments: input type (impute2, beagle, mach), probs or dosages
input.type <- match.arg(input.type)
# return an error if the inputs are dosages but the desired outputs are genotypes
output.type <- match.arg(output.type)
if (input.dosage == TRUE & output.type == "genotype") {
stop("genotypes cannot be calculated from dosages")
}
## get file type
file.type <- match.arg(file.type)
# check snp,scan and filetype
if (genotypeDim != "snp,scan" & file.type == "ncdf") stop("ncdf files must be snp,scan")
# determine number of SNPs and samples
if (verbose) message("Determining number of SNPs and samples...")
Cnt <- count.fields(input.files[1])
line.cnt <- length(Cnt)
col.cnt <- max(Cnt)
if (input.type == "IMPUTE2") {
nsnp <- line.cnt
nsamp <- (col.cnt-5)/3
}
if (input.type == "BEAGLE") {
nsnp <- line.cnt - 1
if (input.dosage) nsamp <- col.cnt-3 else nsamp <- (col.cnt-3)/3
}
if (input.type == "MaCH") {
if (input.dosage) nsnp <- col.cnt-2 else nsnp <- (col.cnt-2)/2
nsamp <- line.cnt
}
# assign snp IDs
if (!is.null(snp.exclude)){
# check type
if (!is.numeric(snp.exclude)) stop("snp.exclude must be list of integers to include")
snpID <- 1:(nsnp - length(snp.exclude))
} else {
if (verbose) message("Including all SNPs.")
snpID <- 1:nsnp
snp.exclude <- numeric()
#snp.names <- NA
}
snp.df <- data.frame(snpID=as.integer(snpID + (snp.id.start-1)))
# scan.df - input checking or creation
if (!is.null(scan.df) & !("sampleID" %in% names(scan.df))) {
stop("sampleID required in scan.df.")
if (!("scanID" %in% names(scan.df))) scan.df$scanID <- 1:nrow(scan.df)
} else if (!is.null(scan.df) & !("scanID" %in% names(scan.df))) {
if (verbose) message("scanID mapping not given. Assigning scanIDs automatically.")
scan.df$scanID <- 1:nsamp
} else if (is.null(scan.df)) {
if (verbose) message("scan.df not given. Assigning scanIDs automatically.")
scan.df <- data.frame(scanID=1:nsamp, stringsAsFactors=FALSE)
scan.df$sampleID <- NA
}
# order by scanID -- do we want this?
#scan.df$scanID <- as.integer(scan.df$scanID) # convert to integer
#scan.df <- scan.df[order(scan.df$scanID), , drop=FALSE]
scan.df$added <- FALSE # variable if added to the gds file successfully
#if (nrow(scan.df) != nsamp) stop("Number of samples in scan.df is different than number of samples in file.")
# add some checking so we can use our own scan ids (and our own snp ids?)
#snpID <- 1:nsnp
#scanID <- 1:nsamp
## create data file
if (output.type == "dosage") {
precision <- "single"
miss.val <- -1
} else if (output.type == "genotype") {
precision <- "bit2"
if (file.type == "gds") {
miss.val <- 3
} else if (file.type == "ncdf") {
miss.val <- -1
}
}
if (file.type == "gds") {
gfile <- .createGdsDosage(snp.df, scan.df, filename, genotypeDim, miss.val, precision, compress.annot)
} else if (file.type == "ncdf") {
if (!(requireNamespace("ncdf4"))) {
stop("please install ncdf4 to work with NetCDF files")
}
gfile <- .createNcdfDosage(snp.df, scan.df, filename, miss.val, precision)
}
# read input file(s)
# valid for GDS and NCDF mostly.
if (input.type == "IMPUTE2") {
if (input.dosage) stop("input.dosage=TRUE not valid for input.type=IMPUTE2")
# .samples file
if (verbose) message ("Reading sample file...")
samp.header <- scan(input.files[2], what=character(), nlines=1, quiet=TRUE)
samples <- fread(input.files[2], stringsAsFactors=FALSE, header=FALSE, skip=2, data.table=FALSE)
names(samples) <- samp.header
if (nrow(samples) != nsamp) stop("Sample number mismatch: ", nsamp, " in genotype file; ", nrow(samples), "in samples file")
# check for unique ids
samples$ID <- paste(samples$ID_1, samples$ID_2)
if (any(duplicated(samples$ID))) stop("Sample ID is duplicated in input sample file")
# select samples
if (all(is.na(scan.df$sampleID))) {
i_samp <- 1:nrow(samples)
scan.df$sampleID <- samples$ID
} else {
i_samp <- match(scan.df$sampleID, samples$ID)
}
scan.df <- cbind(scan.df, samples[i_samp, !(names(samples) %in% "ID"),])
# empty matrix for SNP data
snps <- matrix("", nrow=nsnp, ncol=5)
# set up snp df columns
snp.df[, c("snp", "rsID", "position", "alleleA", "alleleB")] <- NA
# .gens file
if (verbose) message ("Reading genotype file...")
opfile <- file(input.files[1], "r")
cnt <- 1 # tracks where we are in the dosage file, snp dimension
i_snp <- 1 # tracks where we are in the gds file, snp dimension
while (length(ss <- scan(opfile, what=character(), quiet=TRUE, nlines=block.size)) > 0) {
if (verbose) message("Block ", ceiling(cnt/block.size), " of ", ceiling(nsnp/block.size))
dat <- matrix(ss, ncol=col.cnt, byrow=TRUE) # snps are the first column
# selected snps
dat <- dat[!((1:nrow(dat)+cnt-1) %in% snp.exclude), , drop=FALSE]
# check that all are selected
if (nrow(dat) < 1){
cnt <- cnt + block.size
next
}
# add snp info to annotation
snp.df$snp[i_snp : (i_snp+nrow(dat)-1)] <- dat[, 1]
snp.df$rsID[i_snp : (i_snp+nrow(dat)-1)] <- dat[, 2]
snp.df$position[i_snp : (i_snp+nrow(dat)-1)] <- as.integer(dat[, 3])
snp.df$alleleA[i_snp : (i_snp+nrow(dat)-1)] <- dat[, 4]
snp.df$alleleB[i_snp : (i_snp+nrow(dat)-1)] <- dat[, 5]
# get dosage info
dat <- dat[, 6:ncol(dat), drop=FALSE]
if (output.type == "dosage") {
dosage <- .probToDosage(dat)
} else if (output.type == "genotype") {
dosage <- .probToGenotype(dat, prob.threshold=prob.threshold)
} else {
stop("output.type needs to be either dosage or genotype")
}
if (ncol(dosage) != nsamp) stop("number of dosage columns not equal to number of samples in file")
# subset dosage to match this set of samples, snp subsetting was already taken care of
dosage <- dosage[, i_samp, drop=FALSE]
if(genotypeDim == "snp,scan") {
start <- c(i_snp, 1)
count <- c(nrow(dosage), ncol(dosage))
} else if (genotypeDim == "scan,snp") {
start <- c(1, i_snp)
count <- c(ncol(dosage), nrow(dosage))
dosage <- t(dosage)
}
# check for missing values
dosage[dosage < 0 | dosage > 2 | is.na(dosage)] <- miss.val
.addDosage(gfile, dosage, start=start, count=count)
cnt <- cnt + block.size
if (genotypeDim == "snp,scan"){
i_snp <- i_snp + nrow(dosage)
} else if (genotypeDim == "scan,snp"){
i_snp <- i_snp + ncol(dosage)
}
}
close(opfile)
scan.df$added[scan.df$sampleID %in% samples$ID] <- TRUE
}
if (input.type == "BEAGLE") {
# .markers file
if (verbose) message ("Reading SNP file...")
snps <- fread(input.files[2], stringsAsFactors=FALSE, header=FALSE, data.table=FALSE)
names(snps) <- c("marker", "position", "alleleA", "alleleB")
if (nrow(snps) != nsnp) stop("SNP number mismatch: ", nsnp, " in genotype file; ", nrow(snps), "in markers file")
# .dose or .gprobs file
if (verbose) message ("Reading genotype file...")
# sample names in header
opfile <- file(input.files[1], open="r")
header <- scan(opfile, what=character(), quiet=TRUE, nlines=1)
header <- header[4:length(header)]
if (input.dosage) sample.id <- header else sample.id <- header[c(TRUE,FALSE,FALSE)]
samples <- data.frame("ID"=sample.id, stringsAsFactors=FALSE)
if (all(is.na(scan.df$sampleID))) {
i_samp <- 1:nrow(samples)
scan.df$sampleID <- samples$ID
} else {
i_samp <- match(scan.df$sampleID, samples$ID)
}
# set up snp annotation
snp.df[, c("snp", "position", "alleleA", "alleleB")] <- NA
cnt <- 1 # tracks location in the impute file
i_snp <- 1 # tracks location in the gds file
while (length(ss <- scan(opfile, what=character(), quiet=TRUE, nlines=block.size)) > 0) {
if (verbose) message("Block ", ceiling(cnt/block.size), " of ", ceiling(nsnp/block.size))
dat <- matrix(ss, ncol=col.cnt, byrow=TRUE)
if (!allequal(snps[cnt:(cnt+nrow(dat)-1),c(1,3,4)], dat[,1:3])) stop ("markers file does not match genotype file")
# selected snps
dat <- dat[!((1:nrow(dat)+cnt-1) %in% snp.exclude), , drop=FALSE]
# check that all are selected
if (nrow(dat) < 1){
cnt <- cnt + block.size
next
}
# add snp info to annotation
snp.df$snp[i_snp : (i_snp + nrow(dat) - 1)] <- dat[, 1]
snp.df$alleleA[i_snp : (i_snp + nrow(dat) - 1)] <- dat[, 2]
snp.df$alleleB[i_snp : (i_snp + nrow(dat) - 1)] <- dat[, 3]
dat <- dat[, 4:ncol(dat), drop=FALSE]
if (input.dosage) {
# BEAGLE has B allele dosage
mode(dat) <- "numeric"
dosage <- 2 - dat
} else {
if (output.type == "dosage") {
dosage <- .probToDosage(dat)
} else if (output.type == "genotype") {
dosage <- .probToGenotype(dat, prob.threshold=prob.threshold)
}
}
if (ncol(dosage) != nsamp) stop("number of dosage columns not equal to number of samples")
dosage <- dosage[, i_samp, drop=FALSE]
if(genotypeDim == "snp,scan") {
start <- c(i_snp, 1)
count <- c(nrow(dosage), -1)
} else if (genotypeDim == "scan,snp") {
start <- c(1, i_snp)
count <- c(-1, nrow(dosage))
dosage <- t(dosage)
}
# check for missing values
dosage[dosage < 0 | dosage > 2 | is.na(dosage)] <- miss.val
.addDosage(gfile, dosage, start=start, count=count)
cnt <- cnt + block.size
if (genotypeDim == "snp,scan"){
i_snp <- i_snp + nrow(dosage)
} else if (genotypeDim == "scan,snp"){
i_snp <- i_snp + ncol(dosage)
}
}
close(opfile)
# keep track of added samples
scan.df$added[scan.df$sampleID %in% samples$ID] <- TRUE
# add position
snp.df$position <- snps$position[!((1:nrow(snps)) %in% snp.exclude)]
}
if (input.type == "MaCH") {
# .mlinfo file
if (verbose) message ("Reading SNP files...")
snps <- fread(input.files[2], stringsAsFactors=FALSE, header=TRUE, data.table=FALSE)
snps <- snps[,1:3]
names(snps) <- c("snp", "alleleA", "alleleB")
if (nrow(snps) != nsnp) stop("SNP number mismatch: ", nsnp, " in genotype file; ", nrow(snps), "in mlinfo file")
# get snps to exclude
i_snp <- !((1:nsnp) %in% snp.exclude)
snp.df[, c("snp", "alleleA", "alleleB")] <- NA
snp.df$snp <- snps$snp[i_snp]
snp.df$alleleA <- snps$alleleA[i_snp]
snp.df$alleleB <- snps$alleleB[i_snp]
# file with SNP positions
snp2 <- fread(input.files[3], stringsAsFactors=FALSE, header=TRUE, data.table=FALSE)
snp2 <- snp2[,c("SNP", "position")]
names(snp2) <- c("snp", "position")
if (!setequal(snps$snp, snp2$snp)) stop("SNP column in files ", input.files[2], " and ", input.files[3], " do not contain the same values")
#ord <- snps$SNP
#snp.df <- merge(snp.df, snp2, sort=FALSE, all.x=TRUE)
#snp.df <- snp.df[order(snp.df$snpID),]
snp.df$position <- snp2$position[i_snp]
# .mldose or .mlprob file
if (verbose) message ("Reading genotype file...")
opfile <- file(input.files[1], "r")
cnt <- 1
while (length(ss <- scan(opfile, what=character(), quiet=TRUE, nlines=block.size)) > 0) {
if (verbose) message("Block ", ceiling(cnt/block.size), " of ", ceiling(nsamp/block.size))
dat <- matrix(ss, ncol=col.cnt, byrow=TRUE)
#samp.dat[cnt:(cnt+nrow(dat)-1)] <- dat[,1]
samp.block <- dat[,1]
dat <- dat[,3:ncol(dat),drop=FALSE]
if (input.dosage) {
mode(dat) <- "numeric"
dosage <- t(dat)
} else {
if (output.type == "dosage") {
dosage <- t(.probToDosage(dat, BB = FALSE))
} else if (output.type == "genotype") {
dosage <- t(.probToGenotype(dat, BB = FALSE, prob.threshold=prob.threshold))
}
}
if (nrow(dosage) != nsnp) stop("number of dosage rows not equal to number of SNPs")
# loop over samples to add them. Lots of indices here:
# i_dos tracks the location in the dosage array
# i_samp finds the location in the gds file/scan.df
# i_snp matches the ordering of snps in the dosage array to the ordering of snps in the gds file
for (i_dos in 1:length(samp.block)) {
# this sample
samp <- samp.block[i_dos]
# check if sampleID has been given
if (all(!is.na(scan.df$sampleID)) & (samp %in% scan.df$sampleID)) {
# add the genotype
i_samp <- which(scan.df$sampleID %in% samp)
} else if (any(is.na(scan.df$sampleID)) & !(samp %in% scan.df$sampleID)) {
# take the first missing sampleID
i_samp <- min(which(is.na(scan.df$sampleID)))
scan.df$sampleID[i_samp] <- samp
} else {
# something failed?
next
}
# get dosage for that sample and reorder to match gds snp ordering.
dos.samp <- dosage[i_snp, i_dos]
# check for missing values
dos.samp[dos.samp < 0 | dos.samp > 2 | is.na(dos.samp)] <- miss.val
if (genotypeDim == "snp,scan"){
start <- c(1, i_samp)
count <- c(-1, 1)
} else if (genotypeDim == "scan,snp") {
start <- c(i_samp, 1)
count <- c(1, -1)
}
# write all snps for that sample
.addDosage(gfile, dos.samp, start=start, count=count)
scan.df$added[i_samp] <- TRUE
cnt <- cnt+1
}
}
close(opfile)
}
# zero out samples that haven't been added
i_zero <- which(!scan.df$added)
for (i_samp in i_zero){
if (genotypeDim == "snp,scan"){
start <- c(1, i_samp)
count <- c(-1, 1)
} else if (genotypeDim == "scan,snp") {
start <- c(i_samp, 1)
count <- c(1, -1)
}
dos.samp <- rep(-1, nrow(snp.df))
# write all snps for that sample
.addDosage(gfile, dos.samp, start=start, count=count)
}
## # compress (if gds)
## if (file.type == "gds" & compress != "") {
## if (verbose) message("Compressing...")
## .compressDosage(gfile, compress)
## }
# set up annotation
#samples$scanID <- 1:nrow(samples) ## this will need to change.
#if ("sex" %in% names(samples)) names(samples)[names(samples) %in% "sex"] <- "sex.sample"
if ("sex" %in% names(scan.df)) {
if (!all(scan.df$sex %in% c("M","F"))) {
if (all(scan.df$sex %in% c(1,2))) {
scan.df$sex <- c("1"="M", "2"="F")[as.character(scan.df$sex)]
} else {
names(scan.df)[names(scan.df) %in% "sex"] <- "sex.sample"
}
}
}
scanAnnot <- ScanAnnotationDataFrame(scan.df)
#snps$snpID <- 1:nrow(snps)
# convert chromosome type
xchr.str <- c(1:22, "X", "Y", "XY", "MT", "M", 23, 24, 25, 26)
xchr <- as.integer(c(1:22, 23, 25, 24, 26, 26, 23, 24, 25, 26))
snp.df$chromosome <- xchr[match(chromosome, xchr.str)]
snp.df$position <- as.integer(snp.df$position)
snpAnnot <- SnpAnnotationDataFrame(snp.df)
# add variable data
if (verbose) message("Writing annotation...")
.addSnpVars(gfile, snpAnnot, compress.annot)
# save annotation
save(snpAnnot, file=snp.annot.filename)
save(scanAnnot, file=scan.annot.filename)
# compress (if gds)
if (file.type == "gds" & compress != "") {
if (verbose) message("Compressing...")
gfile <- .reopenGds(gfile)
.compressDosage(gfile, compress)
}
# close file
.close(gfile, verbose=verbose)
return(invisible(NULL))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.