or relatedness

setGeneric("makeSparseMatrix", function(x, ...) standardGeneric("makeSparseMatrix"))

setMethod("makeSparseMatrix",
          "matrix",
          function(x, thresh = 2^(-11/2), sample.include = NULL, diag.value = NULL, verbose = TRUE){
            .makeSparseMatrix_matrix(x = x, thresh = thresh, sample.include = sample.include, diag.value = diag.value, verbose = verbose)
          })

setMethod("makeSparseMatrix",
          "Matrix",
          function(x, thresh = 2^(-11/2), sample.include = NULL, diag.value = NULL, verbose = TRUE){
          	.makeSparseMatrix_matrix(x = x, thresh = thresh, sample.include = sample.include, diag.value = diag.value, verbose = verbose)
          })

setMethod("makeSparseMatrix",
          "data.frame",
          function(x, thresh = NULL, sample.include = NULL, diag.value = NULL, verbose = TRUE){
          	.makeSparseMatrix_df(x = as.data.table(x), thresh = thresh, sample.include = sample.include, diag.value = diag.value, verbose = verbose)
          })

setMethod("makeSparseMatrix",
          "data.table",
          function(x, thresh = NULL, sample.include = NULL, diag.value = NULL, verbose = TRUE){
            .makeSparseMatrix_df(x = x, thresh = thresh, sample.include = sample.include, diag.value = diag.value, verbose = verbose)
          })


.makeSparseMatrix_matrix <- function(x, thresh = 2^(-11/2), sample.include = NULL, diag.value = NULL, verbose = TRUE){

    # keep R CMD check from warning about undefined global variables
    ID1 <- ID2 <- NULL

    # check that thresh is not NULL
    if(is.null(thresh)){
        stop("thresh must be specified when x is a matrix object")
    }

    # check for names
    if(is.null(colnames(x))) {
        if(is.null(rownames(x))) {
            stop("dimnames must be provided for matrix x to track sample order in the output matrix.")
        } else {
            colnames(x) <- rownames(x)
        }
    }
    if(is.null(rownames(x))) {
        rownames(x) <- colnames(x)
    }
    
    # check that matrix is square
    if(!all(rownames(x) == colnames(x))){
        stop("x must be square when it is a matrix object")
    }

    # check sample.include
    if(!is.null(sample.include)){
        # subset to samples in sample.include
        if(verbose) message('Using ', length(sample.include), ' samples in sample.include')
        x <- x[rownames(x) %in% sample.include, colnames(x) %in% sample.include]
    }else{
        # get list of all samples in the data
        sample.include <- sort(rownames(x))
        if(verbose) message("Using ", length(sample.include), " samples provided")
    }

    # check for diag values
    if(is.null(diag.value)){
        if(any(is.na(diag(x)))) stop('When `diag.value` is NULL, diagonal values must be provided for all samples')
    }

    # get the table of all related pairs
    rel <- apply(x, MARGIN = 1, FUN = function(v){ names(v)[v > thresh & !is.na(v)] })
    rel <- lapply(seq_along(rel), function(i) { data.table('ID1' = names(rel)[[i]], 'ID2' = rel[[i]]) })
    rel <- rbindlist(rel)
    rel <- rel[,`:=`(ID1 = as.character(ID1), ID2 = as.character(ID2))]
    setkeyv(rel, c('ID1', 'ID2'))

    # create graph of relatives
    if(verbose) message("Identifying clusters of relatives...")
    g <- igraph::graph_from_data_frame(rel[ID1 != ID2])
    # extract cluster membership
    clu <- igraph::components(g)
    mem <- clu$membership
    
    blocks <- list()
    block.id <- list()
    if(clu$no > 0){
        if(verbose) message("    ", length(mem), " relatives in ", clu$no, " clusters; largest cluster = ", max(clu$csize))
        if(verbose) message("Creating block matrices for clusters...")
        for(i in 1:clu$no){
            # samples in the cluster
            ids <- names(mem[mem == i])
            
            # extract the matrix for all pairs in the cluster
            submat <- x[rownames(x) %in% ids, colnames(x) %in% ids]

            # fix the diagonal
            if(!is.null(diag.value)){
                diag(submat) <- diag.value
            }

            # store in the list
            blocks[[i]] <- submat
            block.id[[i]] <- rownames(submat)
        }
    }else{
        if(verbose) message("    No clusters identified")
    }
        
    # add in identity matrix of unrelated samples
    unrel.id <- setdiff(sample.include, names(mem))
    if(verbose) message(length(unrel.id), " samples with no relatives included")

    if(length(unrel.id) > 0) {
        if(is.null(diag.value)){
            # data for the diagonal
            ddat <- diag(x)[rownames(x) %in% unrel.id]
            blocks[[clu$no + 1]] <- Diagonal(n = length(ddat), x = ddat)
            block.id[[clu$no + 1]] <- names(ddat)
        }else{
            blocks[[clu$no + 1]] <- Diagonal(n = length(unrel.id), x = diag.value)
            block.id[[clu$no + 1]] <- unrel.id
        }
    }

    # create block diagonal matrix
    if(length(blocks) > 1){
        if(verbose) message("Putting all samples together into one block diagonal matrix")
        mat_sparse <- bdiag(blocks)
    }else{
        mat_sparse <- Matrix(blocks[[1]])
    } 
    
    # ids of samples
    mat.id <- unlist(block.id)
    rownames(mat_sparse) <- mat.id
    colnames(mat_sparse) <- mat.id

    # set the matrix to symmetric to save memory
    mat_sparse <- as(mat_sparse, "symmetricMatrix")
    return(mat_sparse)
}

.makeSparseMatrix_df <- function(x, thresh = NULL, sample.include = NULL, diag.value = NULL, verbose = TRUE){

    # keep R CMD check from warning about undefined global variables
    ID1 <- ID2 <- value <- NULL

    # check sample.include
    if(!is.null(sample.include)){
        # subset to samples in sample.include
        if(verbose) message('Using ', length(sample.include), ' samples in sample.include')
        x <- x[ID1 %in% sample.include & ID2 %in% sample.include]
    }else{
        # get list of all samples in the data
        sample.include <- unique(rbind(setnames(unique(x[,'ID1']), 'ID1', 'ID'), setnames(unique(x[,'ID2']), 'ID2', 'ID'))[,'ID'])
        setkey(sample.include, 'ID')
        sample.include <- sample.include[['ID']]
        # sample.include <- sort(unique(c(x$ID1, x$ID2)))
        if(verbose) message("Using ", length(sample.include), " samples provided")
    }

    # check for diag values
    if(is.null(diag.value)){
    	if(!setequal(x[ID1 == ID2, ID1], sample.include)) stop('When `diag.value` is NULL, diagonal values must be provided for all samples')
    }

    # create graph of relatives
    if(verbose) message("Identifying clusters of relatives...")
    if(is.null(thresh)){
        g <- igraph::graph_from_data_frame(x[ID1 != ID2 & value != 0])
    }else{
        g <- igraph::graph_from_data_frame(x[ID1 != ID2 & value > thresh])
    }
    # extract cluster membership
    clu <- igraph::components(g)
    mem <- clu$membership
    
    blocks <- list()
    block.id <- list()
    if(clu$no > 0){
        if(verbose) message("    ", length(mem), " relatives in ", clu$no, " clusters; largest cluster = ", max(clu$csize))
        if(verbose) message("Creating block matrices for clusters...")
        for(i in 1:clu$no){
            # samples in the cluster
            ids <- names(mem[mem == i])
            # create a table for all pairs in the cluster
            allpairs <- as.data.table(expand.grid(ID2 = ids, ID1 = ids, stringsAsFactors=FALSE))
            
            # merge
            sub <- x[ID1 %in% ids & ID2 %in% ids][allpairs, on = c("ID1", "ID2")]
            # set pairs without values to 0
            sub[is.na(value), value := 0]
            
            # cast to a matrix
            submat <- reshape2::acast(data = sub, formula = ID1 ~ ID2, value.var = "value")
            # put the values on both sides of the diagonal
            submat <- submat + t(submat)
            # fix the diagonal
            if(is.null(diag.value)){
                diag(submat) <- 0.5*diag(submat)
            }else{
                diag(submat) <- diag.value
            }
            
            # store in the list
            blocks[[i]] <- submat
            block.id[[i]] <- rownames(submat)
        }
    }else{
        if(verbose) message("    No clusters identified")
    }
        
    # add in identity matrix of unrelated samples
    unrel.id <- setdiff(sample.include, names(mem))
    if(verbose) message(length(unrel.id), " samples with no relatives included")

    if(length(unrel.id) > 0) {
        if(is.null(diag.value)){
            # data for the diagonal
            ddat <- x[ID1 == ID2, c('ID1', 'value')][data.table(ID1 = unrel.id), on = 'ID1']
            blocks[[clu$no + 1]] <- Diagonal(n = nrow(ddat), x = ddat$value)
            block.id[[clu$no + 1]] <- ddat$ID1
        }else{
            blocks[[clu$no + 1]] <- Diagonal(n = length(unrel.id), x = diag.value)
            block.id[[clu$no + 1]] <- unrel.id
        }
    }

    # create block diagonal matrix
    if(length(blocks) > 1){
        if(verbose) message("Putting all samples together into one block diagonal matrix")
        mat_sparse <- bdiag(blocks)
    }else{
        mat_sparse <- Matrix(blocks[[1]])
    } 
    
    # ids of samples
    mat.id <- unlist(block.id)
    rownames(mat_sparse) <- mat.id
    colnames(mat_sparse) <- mat.id

    # set the matrix to symmetric to save memory
    mat_sparse <- as(mat_sparse, "symmetricMatrix")
    return(mat_sparse)
}




setGeneric("pcrelateToMatrix", function(pcrelobj, ...) standardGeneric("pcrelateToMatrix"))

setOldClass("pcrelate")
setMethod("pcrelateToMatrix",
          "pcrelate",
          function(pcrelobj, sample.include = NULL, thresh = NULL, scaleKin = 2, verbose = TRUE) {
              .pcrelateToMatrix(pcrelobj = pcrelobj,
                                sample.include = sample.include,
                                thresh = thresh,
                                scaleKin = scaleKin,
                                verbose = verbose)
          })

.pcrelateToMatrix <- function(pcrelobj, sample.include = NULL, thresh = NULL, scaleKin = 2, verbose = TRUE){
    # keep R CMD check from warning about undefined global variables
    f <- ID1 <- ID2 <- kin <- NULL
    
    # get the diagonals
    x <- as.data.table(pcrelobj$kinSelf)[, c('ID', 'f')]
    setnames(x, 'ID', 'ID1')
    x[, ID2 := ID1]
    x[, kin := 0.5*(1 + f)][, f := NULL]

    # append the off-diagonal
    x <- rbind(x, as.data.table(pcrelobj$kinBtwn)[,c('ID1', 'ID2', 'kin')])

    # scale the values
    x[, kin := scaleKin*kin]
    
    # set kin name to value
    setnames(x, 'kin', 'value')
    # call makeSparseMatrix
    makeSparseMatrix(x = x, thresh = thresh, sample.include = sample.include, diag.value = NULL, verbose = verbose)
}


setGeneric("kingToMatrix", function(king, ...) standardGeneric("kingToMatrix"))

setMethod("kingToMatrix",
          "character",
          function(king, estimator = c("PropIBD", "Kinship"), sample.include = NULL, thresh = NULL, verbose = TRUE) {
              .kingToMatrix(file.king = king,
                            estimator = estimator,
                            sample.include = sample.include,
                            thresh = thresh,
                            verbose = verbose)
          })

setOldClass("snpgdsIBDClass")
setMethod("kingToMatrix",
          "snpgdsIBDClass",
          function(king, sample.include = NULL, thresh = 2^(-11/2), verbose = TRUE) {
              ID <- king$sample.id
              king <- king$kinship
              dimnames(king) <- list(ID, ID)
              makeSparseMatrix(x = king, thresh = thresh, sample.include = sample.include, diag.value = 0.5, verbose = verbose)
          })

.readKing <- function(x, estimator) {
    cols <- intersect(names(fread(x, nrows=0)), c("ID1", "ID2", estimator))
    if (!(estimator %in% cols)) stop("Column ", estimator, " requested but not present in file")
    fread(x, select=cols, colClasses=list(character=c("ID1", "ID2")))
}

.kingToMatrix <- function(file.king, estimator = c("PropIBD", "Kinship"), sample.include = NULL, thresh = NULL, verbose = TRUE){
    # keep R CMD check from warning about undefined global variables
    PropIBD <- Kinship <- value <- NULL
    #ID1 <- ID2 <- PropIBD <- Kinship <- value <- NULL
    #`.` <- function(...) NULL

    # check argument values
    estimator <- match.arg(estimator)
    if(estimator == 'PropIBD'){
        if(verbose) message('Reading in PropIBD estimates from KING --ibdseg output...')
    }else if(estimator == 'Kinship'){
        if(verbose) message('Reading in Kinship estimates from KING --kinship output...')
    }
    
    # if multiple input files
    if(length(file.king) > 1){
        king <- lapply(file.king, .readKing, estimator = estimator)
        # pull out column names in common
        cnames <- Reduce(intersect, lapply(king, colnames))
        # subset and rbind
        king <- rbindlist(lapply(king, function(x){ x[, colnames(x) %in% cnames, with = FALSE]} ))
        
    # one input file
    }else{
        king <- .readKing(file.king, estimator = estimator)
    }
    
    # subset to needed columns
    if(estimator == 'PropIBD'){
        king <- king[, value := 0.5*PropIBD][, PropIBD := NULL]
    }else if(estimator == 'Kinship'){
        setnames(king, 'Kinship', 'value')
    }
        
    # check for duplicate pairs
    setkeyv(king, c('ID1', 'ID2'))
    if(any(duplicated(king))){
        stop('Some sample pairs are provided multiple times in file.king; please only provide one value per sample pair')
    }

    # call makeSparseMatrix
    makeSparseMatrix(x = king, thresh = thresh, sample.include = sample.include, diag.value = 0.5, verbose = verbose)
}

Any scripts or data that you put into this service are public.

GENESIS documentation built on Jan. 30, 2021, 2:01 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

GENESIS
GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/makeSparseMatrix.R
In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

Defines functions .kingToMatrix .readKing .pcrelateToMatrix .makeSparseMatrix_df .makeSparseMatrix_matrix

Try the GENESIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GENESIS GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/makeSparseMatrix.R In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

Defines functions .kingToMatrix .readKing .pcrelateToMatrix .makeSparseMatrix_df .makeSparseMatrix_matrix

Try the GENESIS package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

GENESIS
GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness

R/makeSparseMatrix.R
In GENESIS: GENetic EStimation and Inference in Structured samples (GENESIS): Statistical methods for analyzing genetic data from samples with population structure and/or relatedness