R/gdsWrapper.R

Defines functions addUpdateSegment addUpdateLap addGDS1KGLDBlock addGDSStudyPruning runLDPruning runIBDKING gds2tped gds2tfamSample gds2tfam generateGDS1KGgenotypeFromSNPPileup appendGDSgenotypeMat appendGDSgenotype generateGDSgenotype generateGDSSNPinfo addStudyGDSSample appendGDSSampleOnly appendGDSSample addGDSRef generateGDSSample

Documented in addGDS1KGLDBlock addGDSRef addGDSStudyPruning addStudyGDSSample addUpdateLap addUpdateSegment appendGDSgenotype appendGDSgenotypeMat appendGDSSample appendGDSSampleOnly gds2tfam gds2tfamSample gds2tped generateGDS1KGgenotypeFromSNPPileup generateGDSgenotype generateGDSSample generateGDSSNPinfo runIBDKING runLDPruning

#' @title Initialization of the section related to the sample
#' information in the \code{gds} file.
#'
#' @description This function initializesthe section related to the sample
#' information in the \code{gds} file. The information is extracted from
#' the \code{data.frame} \code{pedDF} passed to the function.
#'
#' @param gds a \code{gds}.
#'
#' @param pedDF a \code{data.frame} containing the information related to the
#' sample. It must have those columns: "sample.id", "Name.ID", "sex",
#' "pop.group", "superPop" and "batch". The unique id of pedDF
#' is Name.ID and the row.name is Name.ID too.
#'
#' @param listSamples a \code{array} with the sample from pedDF to keep
#'
#' @return the a \code{array} with the sample from pedDF keept
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn
#' @encoding UTF-8
#' @keywords internal
generateGDSSample <- function(gds, pedDF, listSamples=NULL){

    if(!(is.null(listSamples))){
        pedDF <- pedDF[listSamples,]
    }
    add.gdsn(gds, "sample.id", pedDF[, "Name.ID"])

    ## Create a data.frame containing the information form the samples
    samp.annot <- data.frame(sex=pedDF[, "sex"],
                                pop.group=pedDF[, "pop.group"],
                                superPop=pedDF[, "superPop"],
                                batch=pedDF[, "batch"],
                                stringsAsFactors=FALSE)

    ## Add the data.frame to the gds object
    add.gdsn(gds, "sample.annot", samp.annot)

    return(pedDF[, "sample.id"])

}

#' @title The function add an array sample.ref to the gds file.It define base
#' on a list of unrelated samples.
#'
#' @description This function create the field sample.ref which is 1 when
#' the samples are a reference and 0 otherwise. The sample.ref is fill based
#' on the file filePart$unrels
#' from  in GENESIS TODO
#'
#' @param gds a \code{gds} object.
#'
#' @param filePart a \code{list} from the function pcairPartition in GENESIS
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn
#' @encoding UTF-8
#' @keywords internal
addGDSRef <- function(gds, filePart) {

    part <- readRDS(filePart)

    sampleGDS <- index.gdsn(gds, "sample.id")
    df <- data.frame(sample.id=read.gdsn(sampleGDS),
                        sample.ref=0, stringsAsFactors=FALSE)

    # The order of part$unrels is not the same than df$sample.id
    df[df$sample.id %in% part$unrels, "sample.ref"] <- 1
    add.gdsn(gds, "sample.ref", df$sample.ref, storage="bit1")

    return(0L)
}


#' @title This function append the fields related to the samples. If the
#' samples are part of a study you must uses the addStudyGDSSample
#'
#' @description This function append the fields related to the samples.
#' The fields append are sample.id and the \code{data.frame} sample.annot.
#' If the samples are in the section study the field related to the
#' study must be fill.
#'
#' @param gds a \code{gds}.
#'
#' @param pedDF a \code{data.frame} with the sample info. Must have the column
#' sample.id, Name.ID, sex, pop.group, superPop and batch. The unique id
#' of pedDF is Name.ID and the row.name is Name.ID too.
#'
#' @param listSamples a \code{array} with the sample from pedDF$Name.ID to keep
#'
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn append.gdsn
#' @encoding UTF-8
#' @keywords internal
appendGDSSample <- function(gds, pedDF, batch=1, listSamples=NULL){

    if(!(is.null(listSamples))){
        pedDF <- pedDF[listSamples,]
    }

    sampleGDS <- index.gdsn(gds, "sample.id")

    append.gdsn(sampleGDS,  val=pedDF$Name.ID, check=TRUE)


    samp.annot <- data.frame(sex = pedDF[, "sex"],
                                pop.group=pedDF[, "pop.group"],
                                superPop=pedDF[, "superPop"],
                                batch=rep(batch, nrow(pedDF)),
                                stringsAsFactors=FALSE)

    print("Annot")
    curAnnot <- index.gdsn(gds, "sample.annot/sex")
    append.gdsn(curAnnot,samp.annot$sex, check=TRUE)
    curAnnot <- index.gdsn(gds, "sample.annot/pop.group")
    append.gdsn(curAnnot, samp.annot$pop.group, check=TRUE)
    curAnnot <- index.gdsn(gds, "sample.annot/superPop")
    append.gdsn(curAnnot, samp.annot$superPop, check=TRUE)
    curAnnot <- index.gdsn(gds, "sample.annot/batch")
    append.gdsn(curAnnot, samp.annot$batch, check=TRUE)
    print("Annot done")

    return(0L)
}

#' @title This function append the fields sample.id.
#'
#' @description This function append the fields samples.id.
#' The fields append are sample.id with the listSample
#'
#' @param gds a \code{gds}.
#'
#' @param listSample a \code{array} of sample.id to add.
#'
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn append.gdsn
#' @encoding UTF-8
#' @keywords internal
appendGDSSampleOnly <- function(gds, listSamples) {

    sampleGDS <- index.gdsn(gds, "sample.id")

    append.gdsn(sampleGDS, val=listSamples, check=TRUE)

    return(0L)
}



#' @title This function create the gds file fields related to the study and
#' the sample in it.
#'
#' @description TODO
#'
#' @param gds a \code{gds} object.
#'
#' @param pedDF a \code{data.frame} with the sample info. Must have the column
#' sample.id, Name.ID, sex, pop.group, superPop and batch. The unique id of
#' pedDF is Name.ID and the row.name is Name.ID too.
#'
#' @param batch a \code{integer} corresponding
#'
#' @param listSamples a \code{array} with the sample from pedDF to keep
#'
#' @param studyDF a \code{data.frame} with at least the column study.id,
#' study.desc and study.platform
#'
#' @return TODO
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn append.gdsn
#' @encoding UTF-8
#' @keywords internal
addStudyGDSSample <- function(gds, pedDF, batch=1, listSamples=NULL, studyDF) {

    if(sum(c("study.id", "study.desc", "study.platform") %in%
                colnames(studyDF)) != 3 ) {
        stop("studyDF incomplete in addStudyGDSSample\n")
    }

    if(!(is.null(listSamples))) {
        if(length(listSamples) == length(intersect(listSamples,
                                                    rownames(pedDF)))) {
            pedDF <- pedDF[listSamples,]
        } else {
            stop("List of samples not include in the ped or ped ",
                    "don't have rownames equal to Name.ID\n")
        }
    } else {
        listSamples <- pedDF$Name.ID
    }

    # sampleGDS <- index.gdsn(gds, "sample.id")

    # samplePres <- read.gdsn(sampleGDS)
    # print(paste0("appendGDSSample ", Sys.time()))
    # appendGDSSample(gds, pedDF, batch=batch, listSamples=listSamples)
    # print(paste0("appendGDSSample DONE ", Sys.time()))

    # add.gdsn(gds, "study.offset", length(samplePres))

    df <- data.frame(study.id=studyDF$study.id,
                        study.desc=studyDF$study.desc,
                        study.platform=studyDF$study.platform,
                        stringsAsFactors=FALSE)

    if(! "study.list" %in% ls.gdsn(gds)){
        add.gdsn(gds, "study.list", df)


        study.annot <- data.frame(data.id=pedDF[, "Name.ID"],
                                    case.id=pedDF[, "Case.ID"],
                                    sample.type=pedDF[, "Sample.Type"],
                                    diagnosis=pedDF[, "Diagnosis"],
                                    source=pedDF[, "Source"],
                                    study.id=rep(studyDF$study.id, nrow(pedDF)),
                                    stringsAsFactors=FALSE)

        add.gdsn(gds, "study.annot", study.annot)
        print(paste0("study.annot DONE ", Sys.time()))
    } else{
        #append.gdsn(gds, "study.list", df)
        append.gdsn(index.gdsn(gds, "study.list/study.id"), df$study.id, check=TRUE)
        append.gdsn(index.gdsn(gds, "study.list/study.desc"), df$study.desc, check=TRUE)
        append.gdsn(index.gdsn(gds, "study.list/study.platform"), df$study.platform, check=TRUE)


        study.annot <- data.frame(data.id=pedDF[, "Name.ID"],
                                  case.id=pedDF[, "Case.ID"],
                                  sample.type=pedDF[, "Sample.Type"],
                                  diagnosis=pedDF[, "Diagnosis"],
                                  source=pedDF[, "Source"],
                                  study.id=rep(studyDF$study.id, nrow(pedDF)),
                                  stringsAsFactors=FALSE)

        append.gdsn(index.gdsn(gds, "study.annot/data.id"), study.annot$data.id, check=TRUE)
        append.gdsn(index.gdsn(gds, "study.annot/case.id"), study.annot$case.id, check=TRUE)
        append.gdsn(index.gdsn(gds, "study.annot/sample.type"), study.annot$sample.type, check=TRUE)
        append.gdsn(index.gdsn(gds, "study.annot/diagnosis"), study.annot$diagnosis, check=TRUE)
        append.gdsn(index.gdsn(gds, "study.annot/source"), study.annot$source, check=TRUE)
        append.gdsn(index.gdsn(gds, "study.annot/study.id"), study.annot$study.id, check=TRUE)
        print(paste0("study.annot DONE ", Sys.time()))
    }
    return(pedDF[,"Name.ID"])

}


#' @title This function create the fields related to the snp
#'
#' @description TODO
#'
#' @param gds a \code{gds} object.
#'
#' @param fileFREQ string with the path and the fileNames to a files with
#' frequence information TODO describe the file
#'
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn
#' @encoding UTF-8
#' @keywords internal
generateGDSSNPinfo <- function(gds, fileFREQ) {

    mapSNVSel <- readRDS(file=fileFREQ)
    print(paste0("Read mapSNVSel DONE ", Sys.time()))

    add.gdsn(node=gds, name="snp.id", paste0("s",seq_len(nrow(mapSNVSel))))
    print(paste0("SNP part snp.id DONE ", Sys.time()))
    add.gdsn(node=gds, name="snp.chromosome",
            as.integer(gsub("chr", "", mapSNVSel$CHROM)), storage = "uint16")
    print(paste0("SNP part snp.chromosome DONE ", Sys.time()))
    add.gdsn(node=gds, name="snp.position", as.integer(mapSNVSel$POS),
                storage="int32")
    print(paste0("SNP part snp.position DONE ", Sys.time()))
    add.gdsn(node=gds, name="snp.allele",
                paste0(mapSNVSel$REF, "/", mapSNVSel$ALT))
    print(paste0("SNP part 1 DONE ", Sys.time()))
    add.gdsn(node=gds, name="snp.AF", as.numeric(mapSNVSel$AF),
                    storage="packedreal24")
    print(paste0("SNP part AF DONE ", Sys.time()))
    add.gdsn(node=gds, name="snp.EAS_AF", val=as.numeric(mapSNVSel$EAS_AF),
                    storage="packedreal24")
    add.gdsn(node=gds, name="snp.EUR_AF", val=as.numeric(mapSNVSel$EUR_AF),
                    storage="packedreal24")
    add.gdsn(node=gds, name="snp.AFR_AF", val=as.numeric(mapSNVSel$AFR_AF),
                    storage="packedreal24")
    add.gdsn(node=gds, name="snp.AMR_AF", val=as.numeric(mapSNVSel$AMR_AF),
                    storage="packedreal24")
    add.gdsn(node=gds, name="snp.SAS_AF", val=as.numeric(mapSNVSel$SAS_AF),
                    storage="packedreal24")

    return(0L)
}



#' @title This function create the field genotype in the gds file
#'
#' @description TODO
#'
#' @param gds a \code{gds} object.
#'
#' @param PATHGENO TODO a PATH to a directory with the a file for each samples
#' with the genotype.
#'
#' @param fileLSNP TODO list of SNP to keep in the file genotype
#'
#' @param listSamples a \code{array} with the sample to keep
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn write.gdsn
#' @importFrom utils read.csv2
#' @encoding UTF-8
#' @keywords internal
generateGDSgenotype <- function(gds, PATHGENO, fileLSNP, listSamples) {

    # File with the description of the SNP keep
    listMat1k <- dir(PATHGENO, pattern=".+.csv.bz2")
    listSample1k <- gsub(".csv.bz2", "", listMat1k)

    listSNP <- readRDS(fileLSNP)

    for(i in seq_len(length(listSamples))) {
        pos <- which(listSample1k == listSamples[i])
        print(listSamples[i])
        if( length(pos) == 1) {
            matSample <- read.csv2(file.path(PATHGENO, listMat1k[pos]),
                                        row.names=NULL)
            matSample <- matSample[listSNP,, drop=FALSE]
            if(i == 1) {
                var.geno <- add.gdsn(gds, "genotype",
                                valdim=c(nrow(matSample),
                                            length(listSamples)),
                                            storage="bit2")
            }

            # Not faster but harder to read
            # matSample[,1] <- rowSums(t(matrix(as.numeric(unlist(strsplit( matSample[,1], "\\|"))),nr=2)))
            # Easier to read
            matSample[matSample[,1] == "0|0",1] <- 0
            matSample[matSample[,1] == "0|1" | matSample[,1] == "1|0",1] <- 1
            matSample[matSample[,1] == "1|1",1] <- 2

            g <- as.matrix(matSample)[,1]
            write.gdsn(var.geno, g, start=c(1, i), count=c(-1,1))
            rm(matSample)
            print(paste0(listMat1k[pos], " ", i))
        }else{
            stop("Missing samples genotype in ", listSamples[i])
        }
    }

    return(0L)
}

#' @title This function append the field genotype in the gds file
#'
#' @description TODO
#'
#' @param gds a \code{gds} object.
#'
#' @param PATHGENO TODO a PATH to a directory with the a file for each
#' samples with the genotype.
#'
#' @param fileLSNP TODO list of SNP to keep in the file genotype
#'
#' @param listSamples  a \code{array} with the sample to keep
#'
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @importFrom utils read.csv2
#' @encoding UTF-8
#' @keywords internal
appendGDSgenotype <- function(gds, listSample, PATHGENO, fileLSNP) {

    # File with the description of the SNP keep
    listMat1k <- dir(PATHGENO, pattern = ".+.csv.bz2")
    listSample1k <- gsub(".csv.bz2", "", listMat1k)

    listSNP <- readRDS(file=fileLSNP)
    geno.var <- index.gdsn(gds, "genotype")
    g <- read.gdsn(node=geno.var, start=c(1, 1), count=c(1,-1))
    nbSample <- length(g)
    print(nbSample)
    for(i in seq_len(length(listSample))) {
        pos <- which(listSample1k == listSample[i])
        if( length(pos) == 1) {
            matSample <- read.csv2(file.path(PATHGENO, listMat1k[pos]),
                                        row.names = NULL)
            matSample <- matSample[listSNP,, drop=FALSE]


            # Not faster but harder to read
            # matSample[,1] <- rowSums(t(matrix(as.numeric(unlist(strsplit( matSample[,1], "\\|"))),nr=2)))
            # Easier to read
            matSample[matSample[,1] == "0|0",1] <- 0
            matSample[matSample[,1] == "0|1" | matSample[,1] == "1|0",1] <- 1
            matSample[matSample[,1] == "1|1",1] <- 2

            g <- as.matrix(matSample)[,1]
            append.gdsn(geno.var,g, check=TRUE)

            rm(matSample)
            print(paste0(listMat1k[pos], " ", i))
        }else {
            stop("Missing 1k samples ", listSample[i])
        }
    }

    return(0L)
}

#' @title This function append the field genotype in the gds file
#'
#' @description TODO
#'
#' @param gds a \code{gds} object.
#'
#' @param matG  a \code{matrix} with the genotype
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @importFrom utils read.csv2
#' @encoding UTF-8
#' @keywords internal
appendGDSgenotypeMat <- function(gds, matG) {

    geno.var <- index.gdsn(gds, "genotype")
    append.gdsn(node=geno.var, val=matG, check=TRUE)

    return(0L)
}


#' @title TODO This function append the genotype and the file related to the
#' pileup
#'
#' @description TODO
#'
#' @param PATHGENO TODO a PATH to a directory with the a file for each
#' samples with the genotype.
#'
#'
#' @param listSamples  a \code{array} with the sample to keep
#'
#' @param listPos  a \code{array}
#'
#' @param offset  a \code{integer} to adjust if the genome start at 0 or 1
#'
#' @param minCov  a \code{array} with the sample to keep
#'
#' @param minProb  a \code{array} with the sample to keep
#'
#' @param seqError  a \code{array} with the sample to keep
#'
#' @param pedDF a \code{data.frame} with the sample info. Must have the column
#' sample.id, Name.ID, sex, pop.group, superPop and batch. The unique id of
#' pedDF is Name.ID and the row.name is Name.ID too.
#'
#' @param batch a \code{integer} corresponding
#'
#' @param studyDF a \code{data.frame} with at least the column study.id,
#' study.desc and study.platform
#'
#' @param PATHGDSSAMPLE TODO a PATH to a directory where a gds specific
#' to the samples with coverage info is keep
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' ## Path to the demo pedigree file is located in this package
#' data.dir <- system.file("extdata", package="RAIDS")
#'
#' ## TODO
#' gds <- "Demo GDS TODO"
#'
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn write.gdsn openfn.gds
#' @importFrom stats qbinom
#' @importFrom utils read.csv
#' @encoding UTF-8
#' @keywords internal
generateGDS1KGgenotypeFromSNPPileup <- function(PATHGENO,
                                                listSamples,listPos, offset,
                                                minCov=10, minProb=0.999,
                                                seqError=0.001,
                                                pedStudy,
                                                batch,
                                                studyDF,
                                                PATHGDSSAMPLE=NULL) {



    # File with the description of the SNP keep
    listMat <- dir(PATHGENO, pattern = ".+.txt.gz")
    listSampleFile <- gsub(".txt.gz", "", listMat)


    g <- as.matrix(rep(-1, nrow(listPos)))

    for(i in seq_len(length(listSamples))) {
        pos <- which(listSampleFile == listSamples[i])
        print(paste0(listSamples[i], " ", Sys.time(), " ", i))
        if(length(pos) == 1){

            matSample <- read.csv(file.path(PATHGENO, listMat[pos]))


            matSample[, "Chromosome"] <- as.integer(gsub("chr", "",
                                                    matSample[, "Chromosome"]))
            matSample[, "Position"] <- matSample[, "Position"] + offset
            matSample[, "count"] <- rowSums(matSample[, c("File1R", "File1A",
                                                    "File1E", "File1D")])

            # matAll <- merge(matSample[,c( "Chromosome", "Position",
            #                               "File1R",  "File1A",
            #                               "count" )],
            #                 listPos,
            #                 by.x = c("Chromosome", "Position"),
            #                 by.y = c("snp.chromosome", "snp.position"),
            #                 all.y = TRUE,
            #                 all.x = FALSE)
            #
            # below same as the merge above but faster

            z <- cbind(c(listPos$snp.chromosome,
                            matSample$Chromosome,
                            matSample$Chromosome),
                        c(listPos$snp.position,
                            matSample$Position,
                            matSample$Position),
                        c(rep(1,nrow(listPos)),
                            rep(0,nrow(matSample)),
                            rep(2,nrow(matSample))),
                        c(rep(0,nrow(listPos)),
                            matSample[, "File1R"],
                            -1 * matSample[, "File1R"]),
                        c(rep(0,nrow(listPos)),
                            matSample[, "File1A"],
                            -1 * matSample[, "File1A"]),
                        c(rep(0,nrow(listPos)),
                            matSample[, "count"],
                            -1 * matSample[, "count"]))
            rm(matSample)
            z <- z[order(z[,1], z[,2], z[,3]),]

            matAll <- data.frame(Chromosome=z[z[,3]==1, 1],
                                    Position=z[z[,3]==1, 2],
                                    File1R=cumsum(z[,4])[z[,3]==1],
                                    File1A=cumsum(z[,5])[z[,3]==1],
                                    count=cumsum(z[,6])[z[,3]==1])
            rm(z)



            if(is.null(PATHGDSSAMPLE)){
                stop("PATHGDSSAMPLE is NULL in ",
                        "generateGDS1KGgenotypeFromSNPPileup\n")
            } else{
                if(! dir.exists(PATHGDSSAMPLE)) {
                    dir.create(PATHGDSSAMPLE)
                }
            }
            fileGDSSample <- file.path(PATHGDSSAMPLE,
                                        paste0(listSamples[i], ".gds"))
            if(file.exists(fileGDSSample)) {
                gdsSample <- openfn.gds(fileGDSSample, readonly=FALSE)
            } else{
                gdsSample <- createfn.gds(fileGDSSample)

                # id <- add.gdsn(gdsSample, "sampleStudy",
                #                 listSamples[i])
            }

            if(! "Ref.count" %in% ls.gdsn(gdsSample)){
                var.Ref <- add.gdsn(gdsSample, "Ref.count",
                                        matAll$File1R,
                                        valdim=c( nrow(listPos), 1),
                                        storage="sp.int16")
                var.Alt <- add.gdsn(gdsSample, "Alt.count",
                                        matAll$File1A,
                                        valdim=c( nrow(listPos), 1),
                                        storage="sp.int16")
                var.Count <- add.gdsn(gdsSample, "Total.count",
                                        matAll$count,
                                        valdim=c( nrow(listPos), 1),
                                        storage="sp.int16")
            } else {
                # you must append
                var.Ref <- append.gdsn(index.gdsn(gdsSample, "Ref.count"),
                                    matAll$File1R)
                var.Alt <- append.gdsn(index.gdsn(gdsSample, "Alt.count"),
                                       matAll$File1A)
                var.Count <- append.gdsn(index.gdsn(gdsSample, "Total.count"),
                                         matAll$count)
            }

            listSampleGDS <- addStudyGDSSample(gdsSample, pedDF=pedStudy,
                                                batch=batch,
                                                listSamples=c(listSamples[i]),
                                                studyDF=studyDF)

                #closefn.gds(gdsSample)



            listCount <- table(matAll$count[matAll$count >= minCov])
            cutOffA <- data.frame(count = unlist(vapply(as.integer(names(listCount)),
                                                        FUN=function(x, minProb, eProb){return(max(2,qbinom(minProb, x,eProb)))},
                                                        FUN.VALUE = numeric(1), minProb=minProb, eProb= 2 * seqError )),
                        allele = unlist(vapply(as.integer(names(listCount)),
                                    FUN=function(x, minProb, eProb){return(max(2,qbinom(minProb, x,eProb)))},
                                    FUN.VALUE = numeric(1), minProb=minProb, eProb=seqError)))
            row.names(cutOffA) <- names(listCount)
            # Initialize the genotype array at -1


            # Select the position where the coverage of the 2 alleles is enough
            listCov <- which(rowSums(matAll[, c("File1R", "File1A")]) >= minCov)


            matAllC <- matAll[listCov,]


            # The difference  depth - (nb Ref + nb Alt) can be realisticly explain by sequencing error
            listCov <- listCov[(matAllC$count -
                                (matAllC$File1R + matAllC$File1A)) <
                                cutOffA[as.character(matAllC$count), "count"]]

            matAllC <- matAll[listCov,]
            rm(matAll)

            g <- as.matrix(rep(-1, nrow(listPos)))
            # The sample is homozygote if the other known allele have a coverage of 0
            g[listCov][which(matAllC$File1A == 0)] <- 0
            g[listCov][which(matAllC$File1R == 0)] <- 2

            # The sample is heterozygote if explain the coverage of the minor allele by
            # sequencing error is not realistic.
            g[listCov][which(matAllC$File1A >= cutOffA[as.character(matAllC$count), "allele"] &
                        matAllC$File1R >= cutOffA[as.character(matAllC$count), "allele"])] <- 1

            #g <- as.matrix(g)
            if("geno.ref" %in% ls.gdsn(gdsSample)){
                var.geno <- index.gdsn(gdsSample, "geno.ref")

                compression.gdsn(var.geno, compress="")
                #write.gdsn(cur, x, start=c(1, 2), count=c(-1,1))
                append.gdsn(var.geno,g, check=TRUE)
                compression.gdsn(var.geno, compress="LZMA_RA.fast")
                readmode.gdsn(var.geno)

            }else{
                var.geno <- add.gdsn(gdsSample, "geno.ref",
                                     valdim=c(length(g),
                                              1),
                                     g,
                                     storage="bit2", compress = "LZMA_RA.fast")
                readmode.gdsn(var.geno)
            }


            rm(g)
            closefn.gds(gdsfile=gdsSample)
            print(paste0(listMat[pos], " ", i, " ", Sys.time()))

        }else{
            stop("Missing samples ", listSamples[i])
        }
    }

    return(0L)
}



#' @title create a file tfam file for plink from the gds file
#'
#' @description TODO
#'
#' @param gds an object of class \link[gdsfmt]{gdsn.class} (a GDS node) or
#' \link[gdsfmt]{gds.class} (a GDS file) TODO
#'
#' @param listSample  a \code{array} with the sample to keep TODO
#'
#' @param pedOUT TODO a PATH and file name to the output file
#'
#' @return TODO a \code{vector} of \code{numeric}
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @importFrom utils write.table
#' @encoding UTF-8
#' @keywords export
gds2tfam <- function(gds, listSample, pedOUT) {

    sampleGDS <- index.gdsn(gds, "sample.id")
    sampleId <-read.gdsn(sampleGDS)
    listS <- which(sampleId %in% listSample)

    sampleGDS <- index.gdsn(gds, "sample.annot")
    sampleANNO <-read.gdsn(sampleGDS)

    pedFile <- data.frame(famId=paste0("F", seq_len(length(listSample))),
                            id=sampleId[listS],
                            fa=rep("0",length(listSample)),
                            mo=rep("0",length(listSample)),
                            sex=sampleANNO$sex[listS],
                            pheno=rep(1,length(listSample)),
                            stringsAsFactors=FALSE)

    write.table(pedFile, pedOUT,
                    quote=FALSE, sep="\t",
                    row.names=FALSE,
                    col.names=FALSE)

}

#' @title create a file tfam file for plink from the gds file
#'
#' @description TODO
#'
#' @param gds an object of class \link[gdsfmt]{gdsn.class} (a GDS node) or
#' \link[gdsfmt]{gds.class} (a GDS file) TODO
#'
#' @param listSample  a \code{array} with the sample to keep
#'
#' @param sampleANNO a \code{data.frame} with at least column sex and the name
#' must be sample.id
#'
#' @param pedOUT TODO a PATH and file name to the output file
#'
#'
#' @return TODO a \code{vector} of \code{numeric}
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @importFrom utils write.table
#' @encoding UTF-8
#' @keywords export
gds2tfamSample <- function(gds, listSample, sampleANNO, pedOUT) {

    sampleGDS <- index.gdsn(gds, "sample.id")
    sampleId <-read.gdsn(sampleGDS)
    listS <- which(sampleId %in% listSample)

    pedFile <- data.frame(famId=paste0("F", seq_len(length(listSample))),
                            id=sampleId[listS],
                            fa=rep("0",length(listSample)),
                            mo=rep("0",length(listSample)),
                            sex=sampleANNO[sampleId[listS], "sex"],
                            pheno=rep(1,length(listSample)),
                            stringsAsFactors=FALSE)

    write.table(pedFile, pedOUT, quote=FALSE, sep="\t",
                    row.names=FALSE, col.names=FALSE)

}


#' @title create a file tped file for plink from the gds file
#'
#' @description TODO
#'
#' @param gds a \code{gds} object.
#'
#' @param listSample  a \code{array} with the sample to keep
#'
#' @param listSNP  a \code{array} with the snp.id to keep
#'
#' @param pedOUT TODO a PATH and file name to the output file
#'
#' @return TODO a \code{vector} of \code{numeric}
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt index.gdsn read.gdsn
#' @importFrom utils write.table
#' @encoding UTF-8
#' @keywords export
gds2tped <- function(gds, listSample, listSNP, pedOUT) {

    sampleGDS <- index.gdsn(gds, "sample.id")
    sampleId <-read.gdsn(sampleGDS)
    listS <- which(sampleId %in% listSample)

    snpGDS <- index.gdsn(gds, "snp.id")
    snpId <- read.gdsn(snpGDS)
    listKeep <- which(snpId %in% listSNP)
    snpId <- snpId[listKeep]

    snpGDS <- index.gdsn(gds, "snp.chromosome")
    snpChr <- read.gdsn(snpGDS)
    snpChr <- snpChr[listKeep]

    snpGDS <- index.gdsn(gds, "snp.position")
    snpPos <- read.gdsn(snpGDS)
    snpPos <- snpPos[listKeep]

    tped <- list()
    tped[[1]] <- snpChr
    tped[[2]] <- snpId
    tped[[3]] <- rep(0,length(snpId))
    tped[[4]] <- snpPos
    k<-4
    geno.var <- index.gdsn(gds, "genotype")
    for(i in listS){
        k <- k + 1

        tmp <- read.gdsn(geno.var, start=c(1, i), count=c(-1,1))[listKeep]

        # 0 1 0 1 0 1
        tped[[k]] <- (tmp == 2) + 1
        k <- k + 1
        tped[[k]] <- (tmp > 0) + 1

    }

    #tped <- do.call(cbind, tped)

    write.table(tped, pedOUT,
                    quote=FALSE, sep="\t",
                    row.names=FALSE,
                    col.names=FALSE)

}


#' @title Function just wrap snpgdsIBDKING
#'
#' @description TODO
#'
#' @param gds an object of class
#' \code{\link[SNPRelate:SNPGDSFileClass]{SNPRelate::SNPGDSFileClass}}, a SNP
#' GDS file.
#'
#' @param sampleId  a \code{vector} of \code{character} strings representing
#' the samples to keep for the analysis. If \code{NULL}, all samples are used.
#' Default: \code{NULL}.
#'
#' @param snp.id  a \code{vector} of \code{character} strings representing
#' the SNPs to keep for the analysis. If \code{NULL}, all SNPs are used.
#' Default: \code{NULL}.
#'
#' @param maf  a single \code{numeric} representing the threshold for the minor
#' allele frequency. Only the SNPs with ">= maf" are retained.
#' Default: \code{0.05}.
#'
#' @return a \code{list} containing:
#' \itemize{
#'     \item{sample.id}{a \code{character} string representing the sample
#'     ids used in the analysis}
#'     \item{snp.id}{a \code{character} string representing the SNP ids
#'     used in the analysis}
#'     \item{k0}{a \code{numeric}, the IBD coefficient, the probability of
#'     sharing zero IBD}
#'     \item{k1}{a \code{numeric}, the IBD coefficient, the probability of
#'     sharing one IBD}
#'     \item{IBS0}{a \code{numeric}, the proportion of SNPs with zero IBS}
#'     \item{kinship}{a \code{numeric}, the proportion of SNPs with zero IBS,
#'     if the parameter kinship=TRUE}
#' }
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom SNPRelate snpgdsIBDKING
#' @encoding UTF-8
#' @keywords internal
runIBDKING <- function(gds, sampleId=NULL, snp.id=NULL, maf=0.05) {

    ## Calculate IBD coefficients by KING method of moment
    ibd.robust <- snpgdsIBDKING(gds, sample.id=sampleId,
                                    snp.id=snp.id,
                                    maf=maf, type="KING-robust")
    return(ibd.robust)
}


#' @title TODO just a wrapper of snpgdsLDpruning
#'
#' @description TODO
#'
#' @param gds an object of class \code{gds} opened
#'
#' @param method the parameter method in SNPRelate::snpgdsLDpruning
#'
#' @param listSamples A \code{vector} of \code{string} corresponding to
#' the sample.ids
#' use in LDpruning
#'
#' @param listKeep the list of snp.id keep
#'
#' @param slide.max.bp.v TODO
#'
#' @param ld.threshold.v TODO
#'
#' @param np a single positive \code{integer} specifying the number of
#' threads to be used. Default: \code{1}.
#'
#' @param verbose.v a \code{logical} indicating if information is shown during
#' the process. Default: \code{FALSE}.
#'
#' @return a \code{list} of SNP IDs stratified by chromosomes as generated
#' by \code{\link[SNPRelate:snpgdsLDpruning]{SNPRelate:snpgdsLDpruning()}}.
#'
#' @examples
#'
#' ## Path to the demo pedigree file is located in this package
#' data.dir <- system.file("extdata", package="RAIDS")
#'
#' ## TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#'
#' @importFrom SNPRelate snpgdsOpen snpgdsLDpruning
#' @importFrom gdsfmt closefn.gds
#' @encoding UTF-8
#' @keywords internal
runLDPruning <- function(gds, method="corr",
                            listSamples=NULL,
                            listKeep=NULL,
                            slide.max.bp.v = 5e5,
                            ld.threshold.v=sqrt(0.1),
                            np=1, verbose.v=FALSE) {

    # validate the para
    # showfile.gds(closeall=FALSE, verbose=TRUE)

    snpset <- snpgdsLDpruning(gds, method="corr",
                                sample.id=listSamples,
                                snp.id=listKeep,
                                slide.max.bp=slide.max.bp.v,
                                ld.threshold=ld.threshold.v,
                                num.thread=np,
                                verbose=verbose.v)
    # closefn.gds(gds)
    return(snpset)
}


#' @title fill the pruned.study in the gds file
#'
#' @description TODO
#'
#' @param gds an object of class \code{gds} opened for the sample
#'
#' @param pruned TODO
#'
#' @param sample.id A \code{string} corresponding to
#' the sample.id
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn
#' @encoding UTF-8
#' @keywords internal
addGDSStudyPruning <- function(gds, pruned, sample.id) {

    if("pruned.study" %in% ls.gdsn(gds)) {
        delete.gdsn(index.gdsn(node=gds, "pruned.study"))
    }

    var.Pruned <- add.gdsn(gds, "pruned.study", pruned)

    sync.gds(gdsfile=gds)

    return(0L)
}

#' @title TODO
#'
#' @description TODO
#'
#' @param gds an object of class \code{gds} opened for the sample
#'
#' @param listBlock TODO
#'
#' @param blockName TODO
#'
#' @param blockDesc TODO
#'
#' @return The integer \code{0L} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn ls.gdsn compression.gdsn append.gdsn sync.gds
#' @encoding UTF-8
#' @keywords internal
addGDS1KGLDBlock <- function(gds, listBlock, blockName, blockDesc) {

    block.annot <- data.frame(block.id=blockName,
                                block.desc=blockDesc,
                                stringsAsFactors=FALSE)

    if(! ("block.annot" %in% ls.gdsn(gds))) {
        var.block.annot <- add.gdsn(gds, "block.annot", block.annot)
    }else {
        curAnnot <- index.gdsn(gds, "block.annot/block.id")
        append.gdsn(curAnnot,block.annot$block.id)
        curAnnot <- index.gdsn(gds, "block.annot/block.desc")
        append.gdsn(curAnnot, block.annot$block.desc)
    }

    var.block <- NULL
    if(! ("block" %in% ls.gdsn(gds))){
        var.block <- add.gdsn(gds, "block",
                                valdim=c(length(listBlock), 1),
                                listBlock, storage="int32",
                                compress = "LZ4_RA")
        readmode.gdsn(var.block)

    }else {
        if(is.null(var.block)) {
            var.block <- index.gdsn(gds, "block")
            var.block <- compression.gdsn(var.block, "")
        }
        append.gdsn(var.block, listBlock)
        var.block <- compression.gdsn(var.block, "LZ4_RA")
    }
    sync.gds(gds)
    return(0L)
}



#' @title TODO
#'
#' @description TODO
#'
#' @param gds an object of class \code{gds} opened for the sample
#'
#' @param snp.lap TODO
#'
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn
#' @encoding UTF-8
#' @keywords internal
addUpdateLap <- function(gds, snp.lap) {

    snpLap <- write.gdsn(index.gdsn(gds, "lap"), snp.lap)

    sync.gds(gds)

    return(0L)
}


#' @title TODO
#'
#' @description TODO
#'
#' @param gds an object of class \code{gds} opened for the sample
#'
#' @param snp.seg TODO
#'
#'
#' @return The integer \code{0} when successful.
#'
#' @examples
#'
#' # TODO
#' gds <- "Demo GDS TODO"
#'
#' @author Pascal Belleau, Astrid Deschênes and Alexander Krasnitz
#' @importFrom gdsfmt add.gdsn index.gdsn delete.gdsn sync.gds ls.gdsn
#' @encoding UTF-8
#' @keywords internal
addUpdateSegment <- function(gds, snp.seg) {

    if("segment" %in% ls.gdsn(gds)) {
        snpLap <- write.gdsn(index.gdsn(gds, "segment"), snp.seg)
    } else{
        snpLap <- add.gdsn(gds, "segment", snp.seg, storage="uint32")
    }

    sync.gds(gds)

    return(0L)
}
belleau/aicsPaper documentation built on Aug. 4, 2022, 1:12 a.m.