GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

test_basic <- function()
{
    filename <- system.file(package="GenomeInfoDb",  "extdata",
                            "dataFiles", "Homo_sapiens.txt")
    checkTrue(file.exists(filename))

    # check the format of the file
    data <- read.table(filename,header=TRUE,sep="\t")
    checkIdentical(c(25L, 7L), dim(data))
    checkIdentical(c('circular', 'auto', 'sex',
                     'NCBI', 'UCSC', 'dbSNP', 'Ensembl'),
                   colnames(data))

    #check if first 3 columns contain only true or false entries
    checkEquals(c(FALSE,TRUE),unique(data[,1]))
    checkEquals(c(TRUE,FALSE),unique(data[,2]))
    checkEquals(c(FALSE,TRUE),unique(data[,3]))
}

test_guessSpeciesStyle <- function()
{
    got  <- GenomeInfoDb:::.guessSpeciesStyle(c(paste0("chr",1:10)))
    checkEquals(unique(got$style), "UCSC")

    got <- GenomeInfoDb:::.guessSpeciesStyle(c(paste0("chr",1:22)))
    checkEquals(unique(got$style), "UCSC")

    got <- GenomeInfoDb:::.guessSpeciesStyle("chr2")
    checkEquals(unique(got$style), "UCSC")

    got <- GenomeInfoDb:::.guessSpeciesStyle("2")
    checkEquals(unique(got$style), c("NCBI","Ensembl","MSU6","JGI2.F","AGPvF"))

    got <- GenomeInfoDb:::.guessSpeciesStyle('T')
    checkEquals(unique(got$style), "JGI2.F")

    got <- GenomeInfoDb:::.guessSpeciesStyle(c("chr1","chr2","chr3",
        "chr1_gl000191_random", "chr1_gl000192_random"))
    checkEquals(unique(got$style), "UCSC")

    got <-  GenomeInfoDb:::.guessSpeciesStyle("h")
    checkEquals(got, NA)
}

test_seqlevelsStyle_character <- function()
{
    #1. correct seqnames
    got <- seqlevelsStyle(c(paste0('chr',1:20)))
    checkEquals(got,"UCSC")

    #2. mix seqnames from 2 styles for same organism
    got2 <- seqlevelsStyle(c('1','MT','Pltd','chr1'))
    checkEquals(got2,"NCBI")

    #3. mix seqnames from 2 different organisms
    got2 <- seqlevelsStyle(c('1','chr2RHet','chr3LHet'))
    checkEquals(got2,"UCSC")

    #4. incorrect seqnames
    checkException(seqlevelsStyle(c('234','567','acv')))

    #5. empty Seqinfo obj - with no seqnames
    checkException(seqlevelsStyle(c('')))
    checkException(seqlevelsStyle(GRanges()))

}

.UCSC_NCBI_MAPPING_SUMMARY <- list(
    # Field 1: UCSC_genome
    # Field 2: NCBI_assembly
    # Field 3: nb of seqlevels that are mapped between UCSC and NCBI
    # Field 4: nb of UCSC seqlevels that are not mapped to NCBI
    # Field 5: nb of NCBI seqlevels that are not mapped to UCSC
    #     1           2                                  3      4        5
    list("apiMel2",  "Amel_2.0",                        16L,    1L,   7151L),
    #list("bosTau6",  "Bos_taurus_UMD_3.1",            3317L,    0L,      0L),
    list("bosTau7",  "Btau_4.6.1",                   11691L,    1L,      1L),
    list("bosTau8",  "Bos_taurus_UMD_3.1.1",          3179L,    0L,      0L),
    list("bosTau9",  "ARS-UCD1.2",                    2211L,    0L,      1L),
    list("calJac3",  "Callithrix jacchus-3.2",       14205L,    0L,      0L),
    list("calJac4",  "Callithrix_jacchus_cj1700_1.1",  964L,    0L,      0L),
    list("canFam3",  "CanFam3.1",                     3268L,    0L,      0L),
    list("canFam4",  "UU_Cfam_GSD_1.0",               2198L,    0L,      0L),
    list("canFam5",  "UMICH_Zoey_3.1",                 794L,    0L,      0L),
    list("canFam6",  "Dog10K_Boxer_Tasha",             147L,    0L,      0L),
    list("ce6",      "WS190",                            7L,    0L,      0L),
    list("ce10",     "WBcel215",                         7L,    0L,      0L),
    list("ce11",     "WBcel235",                         7L,    0L,      0L),
    list("criGri1",  "C_griseus_v1.0",               52710L,    1L,      0L),
    list("danRer7",  "Zv9",                           1133L,    0L,      0L),
    list("danRer10", "GRCz10",                        1061L,    0L,      0L),
    list("danRer11", "GRCz11",                        1923L,    0L,      0L),
    list("dm3",      "Release 5",                       14L,    1L,      0L),
    list("dm6",      "Release 6 plus ISO1 MT",        1870L,    0L,      0L),
    list("equCab2",  "EquCab2.0",                       33L,    1L,   9604L),
    list("equCab3",  "EquCab3.0",                     4701L,    0L,      0L),
    list("felCat9",  "Felis_catus_9.0",               4508L,    0L,      0L),
    list("gadMor1",  "GadMor_May2010",              427427L,    1L,      0L),
    list("galGal3",  "Gallus_gallus-2.1",                34L,   23L,  17118L),
    list("galGal4",  "Gallus_gallus-4.0",            15932L,    0L,      0L),
    list("galGal5",  "Gallus_gallus-5.0",            23475L,    0L,      0L),
    list("galGal6",  "GRCg6a",                         464L,    0L,      0L),
    #list("gasAcu1",  "ASM18067v1", <--- no such mapping despite UCSC claim),
    list("gorGor6",  "Kamilah_GGO_v0",                5486L,    0L,      0L),
    list("hg15",     "NCBI33",                          24L,   20L,    140L),
    #list("hg16",     "NCBI34",                          24L,   18L,    138L),
    #list("hg17",     "NCBI35",                          26L,   20L,     86L),
    #list("hg18",     "NCBI36",                          26L,   23L,     97L),
    list("hg19",     "GRCh37.p13",                     297L,    1L,      0L),
    list("hg38",     "GRCh38.p14",                     711L,    0L,     -2L),
    list("hs1",      "T2T-CHM13v2.0",                   25L,    0L,      0L),
    list("loxAfr3",  "Loxafr3.0",                     2352L,    1L,      1L),
    list("macFas5",  "Macaca_fascicularis_5.0",       7601L,    0L,      0L),
    list("mm8",      "MGSCv36",                         21L,   13L,    360L),
    list("mm9",      "MGSCv37",                         22L,   13L,    283L),
    list("mm10",     "GRCm38.p6",                      239L,    0L,      0L),
    list("mm39",     "GRCm39",                          61L,    0L,      0L),
    list("monDom5",  "MonDom5",                          10L,    1L,   5128L),
    list("musFur1",  "MusPutFur1.0",                  7741L,    0L,     42L),
    list("oreNil2",  "Orenil1.1",                     5678L,    0L,      0L),
    list("panPan1",  "panpan1",                      10867L,    0L,      0L),
    list("panPan2",  "panpan1.1",                    10274L,    0L,      0L),
    list("panPan3",  "Mhudiblu_PPA_v0",               4293L,    0L,      0L),
    list("panTro2",  "Pan_troglodytes-2.1",             26L,   26L,  29214L),
    list("panTro3",  "Pan_troglodytes-2.1.3",        24131L,    1L,      0L),
    list("panTro4",  "Pan_troglodytes-2.1.4",        24129L,    0L,      0L),
    list("panTro5",  "Pan_tro 3.0",                  44449L,    0L,      0L),
    list("panTro6",  "Clint_PTRv2",                   4346L,    0L,      0L),
    list("rheMac2",  "Mmul_051212",                     21L,    1L, 122143L),
    list("rheMac3",  "CR_1.0",                       34102L,    1L,      0L),
    list("rheMac8",  "Mmul_8.0.1",                  284728L,    0L,      0L),
    list("rheMac10", "Mmul_10",                       2939L,    0L,      0L),
    list("rn5",      "Rnor_5.0",                      2739L,    0L,      0L),
    list("rn6",      "Rnor_6.0",                       953L,    0L,      2L),
    list("rn7",      "mRatBN7.2",                      176L,    0L,      0L),
    list("sacCer3",  "R64",                             17L,    0L,      0L),
    list("susScr2",  "Sscrofa9.2",                      19L,    1L,      0L),
    list("susScr3",  "Sscrofa10.2",                   4583L,    0L,      0L),
    list("susScr11", "Sscrofa11.1",                    613L,    0L,      0L),
    list("taeGut2",  "Taeniopygia_guttata-3.2.4",    37096L,    0L,      0L),
    #list("wuhCor1",  "ASM985889v3",                      1L,    0L,      0L),
    list("xenLae2",  "Xenopus_laevis_v2",           108033L,    0L,      0L),
    list("xenTro10", "UCB_Xtro_10.0",                  167L,    0L,      0L)
)

test_switching_Seqinfo_between_UCSC_and_NCBI_styles <- function()
{
    check_UCSC_NCBI_switch <- function(UCSC_genome, NCBI_assembly,
                                       nmapped, UCSC_nunmapped, NCBI_nunmapped)
    {
        ## Start with a Seqinfo object made from a UCSC genome.

        si1 <- Seqinfo(genome=UCSC_genome)
        UCSC_nseqlevels <- nmapped + UCSC_nunmapped
        checkEquals(UCSC_nseqlevels, length(si1))
        checkIdentical("UCSC", seqlevelsStyle(si1))

        si2 <- si1
        seqlevelsStyle(si2) <- "NCBI"
        ugenomes <- unique(genome(si2))
        ## UGLY HACK! We need to special-case hg38 because it contains 2
        ## sequences that do NOT belong to GRCh38.p14. But they can be
        ## found in GRCh38.p13!
        if (UCSC_genome == "hg38") {
            checkIdentical(c("GRCh38.p14", "GRCh38.p13"), ugenomes)
            checkIdentical("NCBI", seqlevelsStyle(si2))
        } else if (UCSC_nunmapped == 0L) {
            checkIdentical(NCBI_assembly, ugenomes)
            checkIdentical("NCBI", seqlevelsStyle(si2))
        } else {
            checkTrue(setequal(c(NCBI_assembly, UCSC_genome), ugenomes))
            checkEquals(nmapped, sum(genome(si2) == NCBI_assembly))
            checkTrue(setequal(c("NCBI", "UCSC"), seqlevelsStyle(si2)))
        }

        seqlevelsStyle(si2) <- "UCSC"
        checkIdentical(si1, si2)

        ## Start with a Seqinfo object made from an NCBI assembly.

        si1 <- Seqinfo(genome=NCBI_assembly)
        NCBI_nseqlevels <- nmapped + NCBI_nunmapped
        checkEquals(NCBI_nseqlevels, length(si1))
        checkIdentical("NCBI", seqlevelsStyle(si1))

        si2 <- si1
        seqlevelsStyle(si2) <- "UCSC"
        ugenomes <- unique(genome(si2))
        if (NCBI_nunmapped <= 0L) {  # <= 0 because of hg38 special case!
            checkIdentical(UCSC_genome, ugenomes)
            checkIdentical("UCSC", seqlevelsStyle(si2))
        } else {
            ## 'ugenomes' will almost always be 'c(UCSC_genome, NCBI_assembly)'
            ## but the order is not 100% guaranteed (e.g. for
            ## musFur1/MusPutFur1.0 it's the opposite order).
            #checkIdentical(c(UCSC_genome, NCBI_assembly), ugenomes)
            checkEquals(2L, length(ugenomes))
            checkTrue(setequal(c(UCSC_genome, NCBI_assembly), ugenomes))
            checkEquals(nmapped, sum(genome(si2) == UCSC_genome))
            ## 'seqlevelsStyle(si2)' will almost always return c("UCSC", "NCBI")
            ## but the order is not 100% guaranteed (e.g. for
            ## musFur1/MusPutFur1.0 it's the opposite order).
            #checkIdentical(c("UCSC", "NCBI"), seqlevelsStyle(si2))
            checkEquals(2L, length(seqlevelsStyle(si2)))
            checkTrue(setequal(c("UCSC", "NCBI"), seqlevelsStyle(si2)))
        }

        seqlevelsStyle(si2) <- "NCBI"
        checkIdentical(si1, si2)
    }

    for (i in seq_along(.UCSC_NCBI_MAPPING_SUMMARY)) {
        args <- .UCSC_NCBI_MAPPING_SUMMARY[[i]]
        do.call(check_UCSC_NCBI_switch, args)
    }
}

test_switching_Seqinfo_to_RefSeq_style <- function()
{
    ## We do our best to guess 'seqlevelsStyle(si3)' based on what seqnames
    ## have changed when we first switched from UCSC to NCBI style and when
    ## we switched again from NCBI to RefSeq style.
    guess_expected_style3 <- function(has_changed_12, has_changed_23)
    {
        stopifnot(is.logical(has_changed_12),
                  is.logical(has_changed_23),
                  length(has_changed_12) == length(has_changed_23))
        expected_style <- character(0)
        if (!all(has_changed_12 | has_changed_23))
            expected_style <- c(expected_style, "UCSC")
        if (any(has_changed_12 > has_changed_23))
            expected_style <- c(expected_style, "NCBI")
        if (any(has_changed_23))
            expected_style <- c(expected_style, "RefSeq")
        expected_style
    }

    check_RefSeq_switch <- function(UCSC_genome, NCBI_assembly)
    {
        is_RefSeq_accession <- GenomeInfoDb:::.is_RefSeq_accession

        ## UGLY HACK! Hardcode list of NCBI seqlevels (+ their UCSC names)
        ## NOT associated with a RefSeq accession in the "Full sequence
        ## report" for GRCh38.p14:
        if (NCBI_assembly == "GRCh38.p14") {
            problematic_NCBI_seqlevels <- c(
                "HSCHR10_1_CTG4",
                "HSCHR11_CTG1_UNLOCALIZED",
                "HSCHR22_UNLOCALIZED_CTG4",
                "HSCHRUN_RANDOM_CTG29"
            )
            ## Their UCSC names.
            problematic_UCSC_seqlevels <- c(
                "chr10_KI270825v1_alt",
                "chr11_KI270721v1_random",
                "chr22_KI270734v1_random",
                "chrUn_KI270752v1"
            )
        } else {
            problematic_NCBI_seqlevels <- character(0)
            problematic_UCSC_seqlevels <- character(0)
        }

        ## Start with a Seqinfo object made from a UCSC genome.

        si1 <- Seqinfo(genome=UCSC_genome)
        ## Remove problematic UCSC seqlevels:
        si1 <- si1[setdiff(seqlevels(si1), problematic_UCSC_seqlevels)]

        si2 <- si1
        seqlevelsStyle(si2) <- "NCBI"
        has_changed_12 <- seqnames(si1) != seqnames(si2)

        si3 <- si2
        seqlevelsStyle(si3) <- "RefSeq"
        checkIdentical(unname(genome(si2)), unname(genome(si3)))
        has_changed_23 <- seqnames(si2) != seqnames(si3)

        current_style3 <- seqlevelsStyle(si3)
        expected_style3 <- guess_expected_style3(has_changed_12, has_changed_23)
        checkTrue(setequal(current_style3, expected_style3))

        ## UGLY HACK! We need to special-case hg38 because it contains 2
        ## sequences that do NOT belong to GRCh38.p14. But they can be
        ## found in GRCh38.p13!
        if (UCSC_genome == "hg38") {
            genome_is_ok <- genome(si2) %in% c("GRCh38.p14", "GRCh38.p13")
        } else {
            genome_is_ok <- genome(si2) == NCBI_assembly
        }
        checkTrue(all(genome_is_ok | !has_changed_23))
        checkTrue(all(is_RefSeq_accession(seqnames(si3)[has_changed_23])))

        si4 <- si1
        seqlevelsStyle(si4) <- "RefSeq"
        checkIdentical(si3, si4)
        seqlevelsStyle(si4) <- "UCSC"
        checkIdentical(si1, si4)

        seqlevelsStyle(si3) <- "NCBI"
        checkIdentical(si2, si3)

        seqlevelsStyle(si2) <- "UCSC"
        checkIdentical(si1, si2)

        ## Start with a Seqinfo object made from an NCBI assembly.

        si1 <- Seqinfo(genome=NCBI_assembly)
        ## Remove problematic NCBI seqlevels:
        si1 <- si1[setdiff(seqlevels(si1), problematic_NCBI_seqlevels)]

        si2 <- si1
        seqlevelsStyle(si2) <- "UCSC"

        si3 <- si1
        seqlevelsStyle(si3) <- "RefSeq"
        checkIdentical(unname(genome(si1)), unname(genome(si3)))
        style <- seqlevelsStyle(si3)
        checkTrue(identical("RefSeq", style) ||
                  identical(c("RefSeq", "NCBI"), style))
        has_changed <- seqnames(si3) != seqnames(si1)
        checkTrue(all(is_RefSeq_accession(seqnames(si3)[has_changed])))

        si4 <- si2
        seqlevelsStyle(si4) <- "RefSeq"
        checkIdentical(si3, si4)
        seqlevelsStyle(si4) <- "NCBI"
        checkIdentical(si1, si4)

        seqlevelsStyle(si2) <- "NCBI"
        checkIdentical(si1, si2)
    }

    ## Exclude some problematic genomes from the RefSeq switch check:
    ## o In the case of canFam4, canFam5, and hs1, it's because chrM is not
    ##   mapped to a RefSeq accession, that is, RefSeqAccn is NA:
    ##   - for chrM in getChromInfoFromNCBI("UU_Cfam_GSD_1.0");
    ##   - for chrM in getChromInfoFromNCBI("UMICH_Zoey_3.1");
    ##   - for MT in getChromInfoFromNCBI("T2T-CHM13v2.0").
    ## o In the case of gadMor1, panTro3, and rheMac3, it's because
    ##   the sequences in associated NCBI assemblies GadMor_May2010,
    ##   Pan_troglodytes-2.1.3, and CR_1.0, have no RefSeq accessions
    ##   assigned to them.
    skip_RefSeq_switch <- c("canFam4", "canFam5", "gadMor1",
                            "hs1", "panTro3", "rheMac3")
    for (i in seq_along(.UCSC_NCBI_MAPPING_SUMMARY)) {
        args <- .UCSC_NCBI_MAPPING_SUMMARY[[i]][1:2]
        if (args[[1L]] %in% skip_RefSeq_switch)
            next
        do.call(check_RefSeq_switch, args)
    }
}

test_genomeStyles <- function()
{
    checkIdentical("data.frame", class(genomeStyles("Homo sapiens")))
    checkIdentical(c(25L, 7L), dim(genomeStyles("Homo sapiens")))
    checkException(genomeStyles("SAD"))
}

test_extractSeqlevels <- function()
{
    got <- extractSeqlevels("Homo sapiens", "UCSC" )
    checkEquals(25,length(got))
    checkEquals("character",class(got))

    checkException(extractSeqlevels("aaa","Homo sapiens"))
    checkException(extractSeqlevels("Drosophila melanogaster"))
}

test_extractSeqlevelsByGroup <- function()
{
    got <- extractSeqlevelsByGroup("Drosophila melanogaster","Ensembl","auto")
    checkEquals(5,length(got))
    checkEquals("character",class(got))

    checkException(extractSeqlevelsByGroup("aaa","Homo sapiens"))
    checkException(extractSeqlevelsByGroup("Drosophila melanogaster"))
    checkException(extractSeqlevelsByGroup("Homo sapiens","auto","NCBI"))
}

test_seqlevelsInGroup <- function()
{
    newch <- paste0("chr",c(1:22,"X","Y","M","1_gl000192_random","4_ctg9_hap1"))
    got1 <- seqlevelsInGroup(newch, group="sex")
    checkEquals(c("chrX","chrY"),got1)

    newchr <- as.character(c(1:22,"X","Y","MT"))
    got2 <- seqlevelsInGroup(newchr, group="all","Homo sapiens","NCBI")
    checkEquals(25,length(got2))
}
Bioconductor/GenomeInfoDb documentation built on Jan. 27, 2025, 12:24 p.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
Bioconductor/GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

inst/unitTests/test_seqlevelsStyle.R
In Bioconductor/GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/GenomeInfoDb Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

inst/unitTests/test_seqlevelsStyle.R In Bioconductor/GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

R Package Documentation

Browse R Packages

We want your feedback!

Bioconductor/GenomeInfoDb
Utilities for manipulating chromosome names, including modifying them to follow a particular naming style

inst/unitTests/test_seqlevelsStyle.R
In Bioconductor/GenomeInfoDb: Utilities for manipulating chromosome names, including modifying them to follow a particular naming style