### LS Mar 2024 functions related to manifests and chip types
#' read.manifest - read in csv format Illumina chip manifest files
#'
#' @param file
#' @return a list of of dataframes of data prepared for making
#' IlluminaMethylationManifest
#' @details
#' This function is probably not much use for calling directly.
#' It mostly exists to be called by canno.
#'
read.manifest <- function(file) {
seps <- grep('^\\[', readLines(file) )
manifest <- read.csv(
file,
skip=seps[2],
nrows=seps[3]-seps[2]-2,
as.is=T
)
# with EPICv2 names are no longer unique!
manifest$Name -> manifest$oldName
manifest$Name <- manifest$IlmnID
stopifnot (exists('Infinium_Design_Type', manifest))
if (1 %in% names(table(manifest$Infinium_Design_Type))){
manifest$Infinium_Design_Type[manifest$Infinium_Design_Type==1] <- 'I'
manifest$Infinium_Design_Type[manifest$Infinium_Design_Type==2] <-'II'
}
TypeI <- manifest[
manifest$Infinium_Design_Type == "I",
c(
"Name",
"AddressA_ID",
"AddressB_ID",
"Color_Channel",
"AlleleB_ProbeSeq"
)
]
names(TypeI)[c(2, 3, 4)] <- c( "AddressA", "AddressB", "Color" )
TypeI <- as(TypeI, "DataFrame")
TypeI$ProbeSeqB <- DNAStringSet(TypeI$AlleleB_ProbeSeq)
TypeI$nCpG <- as.integer(
oligonucleotideFrequency(TypeI$ProbeSeqB, width = 2)[, "CG"] - 1L)
TypeI$nCpG[TypeI$nCpG < 0] <- 0L
# there are 295 probes including 30 cpg probes and nv- and
# rs- annotated SNPs with no CG
TypeI$Name <- as.character(TypeI$Name )
TypeI$AddressA <- as.character(TypeI$AddressA)
TypeI$AddressB <- as.character(TypeI$AddressB)
TypeI$Color <- as.character(TypeI$Color )
TypeI$nCpG <- as.integer (TypeI$nCpG )
TypeSnpI <- TypeI[grep("^rs", TypeI$Name), ]
# unlike minfi we want these in the main data set (as well)
TypeII <- manifest[
manifest$Infinium_Design_Type == "II",
c("Name", "AddressA_ID", "AlleleA_ProbeSeq")
]
names(TypeII)[c(2,3)] <- c("AddressA", "ProbeSeqA")
TypeII <- as(TypeII, "DataFrame")
TypeII$ProbeSeqA <- DNAStringSet(TypeII$ProbeSeqA)
TypeII$nCpG <- as.integer(
letterFrequency(TypeII$ProbeSeqA, letters = "R")
)
# probeseqA don't contain CG because designed to anneal to BS DNA,
# complement of CG is RC
# correct if there are no other Rs in the sequences, haven't studied
# in detail, but there do seem to be more probes with Rs than RC.
TypeII$nCpG[TypeII$nCpG < 0] <- 0L
TypeII$Name <- as.character(TypeII$Name )
TypeII$AddressA <- as.character(TypeII$AddressA)
TypeII$nCpG <- as.integer (TypeII$nCpG )
TypeSnpII <- TypeII[grep("^rs", TypeII$Name), ]
controls <- read.table(
file = file,
skip = seps[3],
sep = ",",
comment.char = "",
quote = "",
colClasses = c(rep("character", 5))
)[, 1:5]
TypeControl <- controls[, 1:4]
names(TypeControl) <- c("Address", "Type", "Color", "ExtendedType")
TypeControl <- as(TypeControl, "DataFrame")
TypeControl$Type <-as.character(TypeControl$Type )
TypeControl$Address <-as.character(TypeControl$Address)
list(
manifestList = list(
TypeI = TypeI,
TypeII = TypeII,
TypeControl = TypeControl,
TypeSnpI = TypeSnpI,
TypeSnpII = TypeSnpII),
manifest = manifest,
controls = controls)
}
#' canno - process csv manifest into annotation object for illumina methylation preprocessing
#'
#' @param man name
#' @return IlluminaMethylationManifest
#' @details
#' This is based on the scripts shipped with minfi annotation packages.
#' It is based on existing
#' Illumina Human Methylation csv format manifests, but because
#' reverse-engineered, may require
#' updates to work on future products.
#'
#'
#
canno <- function (man= "EPIC-8v2-0_A1.csv", name=NULL){
maniTmp <- read.manifest(man)
manifestList <- maniTmp$manifestList
if (is.null(name)) name <- basename(man)
IlluminaMethylationManifest(
TypeI = manifestList$TypeI,
TypeII = manifestList$TypeII,
TypeControl = manifestList$TypeControl,
TypeSnpI = manifestList$TypeSnpI,
TypeSnpII = manifestList$TypeSnpII,
annotation = name)
}
#' idet - identify idats by a hash of the addresses
#'
#' @param idat can be an idat file or the list produced by reading one with readIDAT()
#' @return three strings: ChipType, an md5 hash of the MidBlock (address vector) and if known, the annotation name
#' @details
#'this function is a response to the fact that IlluminaHumanMethylationEPIC
#' and EPICv2 idats both have the ChipType "BeadChip 8x5" but different
#' manifests. They did have different numbers of addresses.
#' Subsequently we have had some confusion....
#' This hash (certainly taken together with the ChipType) should be bomb
#' proof as an identifier. Warning: this is slow.
idet <- function(idat){
knowns <- c(
`2f96172b55a56a146c47b4d48cdfb3e0` =
"IlluminaHumanMethylationEPICmanifest" ,
`d4e153ca79918396f3371bad7ef8082d` =
"IlluminaHumanMethylationEPICmanifest" ,
`e28439b7c9a50e3a0f11252f02d09c7e` =
"IlluminaHumanMethylationEPICmanifest" ,
`485216d14ac9190d13df19681ec775c7` =
"IlluminaHumanMethylationEPICv2manifest",
`02b6dbc3e237054ac03ceb9e209b40cd` =
"IlluminaHumanMethylation450kmanifest" ,
`94aebb309839982e2f887a3d32ac8593` =
"MSA-48v1-0_20102838_A1.csv" ,
`882b1fa6f0cef31805e098a037310775` =
"MouseMethylation-12v1-0_A2.csv"
)
if(is.character(idat)) idat <- readIDAT(idat)
write(idat$MidBlock, x<-R.utils::tmpfile())
hesh <- tools::md5sum(x) #; unlink(x) tmpfile does this
names(hesh) <- NULL
c(idat$ChipType, hesh, knowns[hesh])
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.