readIDAT_nonenc <- function(file, what = c("all", "IlluminaID", "nSNPsRead")) {
readByte <- function(con, n=1, ...) {
readBin(con, what="integer", n=n, size=1, endian="little", signed=FALSE)
}
readShort <- function(con, n=1, ...) {
readBin(con, what="integer", n=n, size=2, endian="little", signed=FALSE)
}
readInt <- function(con, n=1, ...) {
readBin(con, what="integer", n=n, size=4, endian="little", signed=TRUE)
}
readLong <- function(con, n=1, ...) {
readBin(con, what="integer", n=n, size=8, endian="little", signed=TRUE)
}
readBytesToRead <- function(con, ...) {
m <- readByte(con, n=1)
n <- m %% 128
shift <- 0L
while (m %/% 128 == 1) {
## Read next length byte ...
m <- readByte(con, n=1)
## ... which represents the next 7 hi-bits
shift <- shift + 7L
k <- (m %% 128) * 2^shift
## Total number of bytes to read
n <- n + k
}
## 'n' is a numeric; not an integer
n
}
readString <- function(con, ...) {
## From [1] https://code.google.com/p/glu-genetics/source/browse/glu/lib/illumina.py#86:
## String data are encoded as a sequence of one or more length
## bytes followed by the specified number of data bytes.
##
## The lower 7 bits of each length byte encodes the bits that
## comprise the length of the following byte string. When the
## most significant bit it set, then an additional length byte
## follows with 7 additional high bits to be added to the current
## length. The following string lengths are accommodated by
## increasing sequences of length bytes:
##
## length maximum
## bytes length
## ------ --------
## 1 127 B
## 2 16 KB
## 3 2 MB
## 4 256 MB
## 5 32 GB
##
## While this seems like a sensible progression, there is some
## uncertainty about this interpretation, since the longest of
## string observed in the wild has been of length 6,264 with
## two length bytes.
##
## EXAMPLES: by HB (2015-09-11)
## Length Len bytes Iterations (n, m, k, shift)
## 1 (1)
## 127 (127) -> n=128
## 128 (128,1) -> n=0 ,m=1 ,shift=7, k=128 -> n=128
## 255 (255,1) -> n=128,m=1 ,shift=7, k=128 -> n=255
## 256 (128,2) -> n=0 ,m=2 ,shift=7, k=256 -> n=256
## 257 (129,2) -> n=1 ,m=2 ,shift=7, k=256 -> n=257
## 512 (128,4) -> n=0 ,m=4 ,shift=7, k=512 -> n=512
## 81921 (129,128,5) -> n=1 ,m=128,shift=7, k=0 -> n=1+0=1
## -> n=0 ,m=5 ,shift=14,k=81920
## -> n=1+81920=81921
## Parse the number of characters to read
n <- readBytesToRead(con)
## Read the data bytes
readChar(con, nchars=n)
}
readUnknownBytes <- function(con, ...) {
## Parse the number of characters to read
n <- readBytesToRead(con)
## Read the data bytes
c(as.integer(n), readByte(con, n = n))
}
readField <- function(con, field) {
switch(field,
"IlluminaID" = readInt(con = con, n=nSNPsRead),
"SD" = readShort(con = con, n=nSNPsRead),
"Mean"= readShort(con = con, n=nSNPsRead),
"NBeads" = readByte(con = con, n=nSNPsRead),
"MidBlock" = {
nMidBlockEntries <- readInt(con = con, n=1)
MidBlock <- readInt(con = con, n=nMidBlockEntries)
},
"RedGreen" = readInt(con = con, n=1),
"MostlyNull" = readString(con = con),
"Barcode" = readString(con = con),
"ChipType" = readString(con = con),
"MostlyA" = readString(con = con),
"Unknown.1" = readString(con = con),
"Unknown.2" = readString(con = con),
"Unknown.3" = readString(con = con),
"Unknown.4" = readString(con = con),
"Unknown.5" = readString(con = con),
## We don't know what 'Unknown.6' should be, but we know of at
## least one instance [2] where using readString() would trigger
## a warning about nulls in the string stemming from the byte
## sequence (1, 0). Because of this, we choose to read it as
## a byte sequence based of readUnknownBytes(), because it's
## the closest to readString() which we have used for years.
## We might change how this field is parsed in a future version
## when better understood what this field contains, e.g.
## it could be that readShort() should be used. /HB 2022-09-23
## [2] https://github.com/HenrikBengtsson/illuminaio/issues/21
"Unknown.6" = readUnknownBytes(con = con),
"Unknown.7" = readString(con = con),
"RunInfo" = {
nRunInfoBlocks <- readInt(con = con, n=1)
naValue <- as.character(NA)
RunInfo <- matrix(naValue, nrow=nRunInfoBlocks, ncol=5)
colnames(RunInfo) <- c("RunTime", "BlockType", "BlockPars",
"BlockCode", "CodeVersion")
for (ii in seq_len(nRunInfoBlocks)) {
for (jj in 1:5) {
RunInfo[ii,jj] <- readString(con = con)
}
}
RunInfo
},
stop("readIDAT_nonenc: unknown field"))
}
if(! (is.character(file) || try(isOpen(file))))
stop("argument 'file' needs to be either a character or an open, seekable connection")
what <- match.arg(what)
if(is.character(file)) {
stopifnot(length(file) == 1)
file <- path.expand(file)
stopifnot(file.exists(file))
fileSize <- file.info(file)$size
if(grepl("\\.gz$", file))
con <- gzfile(file, "rb")
else
con <- file(file, "rb")
on.exit({
close(con)
})
} else {
con <- file
fileSize <- 0
}
if(!isSeekable(con))
stop("The file connection needs to be seekable")
## Assert file format
magic <- readChar(con, nchars=4)
if (magic != "IDAT") {
stop("Cannot read IDAT file. File format error. Unknown magic: ", magic)
}
## Read IDAT file format version
version <- readLong(con, n=1)
if (version < 3) {
stop("Cannot read IDAT file. Unsupported IDAT file format version: ", version)
}
## Number of fields
nFields <- readInt(con, n=1)
fields <- matrix(0L, nrow=nFields, ncol=3)
colnames(fields) <- c("fieldCode", "byteOffset", "Bytes")
for (ii in 1:nFields) {
fields[ii,"fieldCode"] <- readShort(con, n=1)
fields[ii,"byteOffset"] <- readLong(con, n=1)
}
## The below stems from [1]. However, there is now also [3], which seems to
## have identified a few more of the 'Unknowns' fields. /HB 2022-09-23
## [3] https://github.com/bioinformed/glu-genetics/blob/dcbbbf67a308d35e157b20a9c76373530510379a/glu/lib/illumina.py#L44-L61
knownCodes <- c(
"nSNPsRead" = 1000,
"IlluminaID" = 102,
"SD" = 103,
"Mean" = 104,
"NBeads" = 107,
"MidBlock" = 200,
"RunInfo" = 300,
"RedGreen" = 400,
"MostlyNull" = 401, # 'Manifest' [1,3]
"Barcode" = 402,
"ChipType" = 403,
"MostlyA" = 404, # 'Stripe' [1], 'label' [3]
"Unknown.1" = 405, # 'opa' [3]
"Unknown.2" = 406, # 'Sample ID' [1,3]
"Unknown.3" = 407, # 'descr' [3]
"Unknown.4" = 408, # 'Plate' [1,3]
"Unknown.5" = 409, # 'Well' [1,3]
"Unknown.6" = 410,
"Unknown.7" = 510 # 'unknown' [1,3]
)
nNewFields <- 1
rownames(fields) <- paste("Null", 1:nFields)
for (ii in 1:nFields) {
temp <- match(fields[ii,"fieldCode"], knownCodes)
if (!is.na(temp)) {
rownames(fields)[ii] <- names(knownCodes)[temp]
} else {
rownames(fields)[ii] <- paste("newField", nNewFields, sep=".")
nNewFields <- nNewFields + 1
}
}
stopifnot(min(fields[, "byteOffset"]) == fields["nSNPsRead", "byteOffset"])
seek(con, where=fields["nSNPsRead", "byteOffset"], origin="start")
nSNPsRead <- readInt(con, n=1)
if(what == "nSNPsRead")
return(nSNPsRead)
if(what == "IlluminaID") {
where <- fields["IlluminaID", "byteOffset"]
seek(con, where = where, origin = "start")
res <- readField(con = con, field = "IlluminaID")
return(as.character(res))
}
res <- rownames(fields)
names(res) <- res
res <- res[order(fields[res, "byteOffset"])]
res <- res[names(res) != "nSNPsRead"]
res <- lapply(res, function(xx) {
where <- fields[xx, "byteOffset"]
seek(con, where = where, origin = "start")
readField(con = con, field = xx)
})
Unknowns <- list(
MostlyNull=res$MostlyNull,
MostlyA=res$MostlyA
)
names <- grep("^Unknown[.]", names(res), value = TRUE)
Unknowns[names] <- res[names]
Quants <- cbind(res$Mean, res$SD, res$NBeads)
colnames(Quants) <- c("Mean", "SD", "NBeads")
rownames(Quants) <- as.character(res$IlluminaID)
list(
fileSize=fileSize,
versionNumber=version,
nFields=nFields,
fields=fields,
nSNPsRead=nSNPsRead,
Quants=Quants,
MidBlock=res$MidBlock,
RedGreen=res$RedGreen,
Barcode=res$Barcode,
ChipType=res$ChipType,
RunInfo=res$RunInfo,
Unknowns=Unknowns
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.