# Notes ----
# TotalSeq_Barcodes is more complete than TotalSeq_Cocktails.
# TotalSeq_Cocktails just used to fill in missing isotype control names.
# Setup ----
library("readxl")
library("dplyr")
library("readr")
library("AbNames")
library("stringr")
library("janitor")
library("stringi")
#---------------------------------------------------------------------------
# Format TotalSeq barcode tables ----
# (Manually downloaded from
# https://www.biolegend.com/en-us/totalseq/barcode-lookup
# and converted to csv due to import error)
# Note: "NA" antigens are isotype controls and streptavidin conjugates
# Location of the manually exported barcode tables, and the TotalSeq category
# of each file (same order as the file names).
dest <- "~/Analyses/CITEseq_curation/data/totalseq_barcodes"
totalseq_cat <- c("A", "B", "C", "D")
dest_fnames <- file.path(
  dest,
  c("totalseq_a_antibodies.csv", "totalseq_b_antibodies.csv",
    "totalseq_c_antibodies.csv", "totalseq_d_antibodies.csv")
)

# Read every table and tag its rows with the category of its source file.
ts_barcodes <- Map(
  function(csv_path, ts_letter) {
    dplyr::mutate(readr::read_delim(csv_path), TotalSeq_Cat = ts_letter)
  },
  dest_fnames, totalseq_cat
)

# Separator between the primary reactivity and the cross-reactivity list.
# The "Cross-Reactivity:" label may be wrapped in <b>, <B> or <strong> tags.
cross_rx_sep <- ", (<[Bb]>|<strong>)?Cross-Reactivity:(</[Bb]>|</strong>)? "

# Stack the four tables, standardise the column names, split the reactivity
# column, tidy whitespace, and drop rows with neither an antigen nor a clone.
ts_barcodes <- dplyr::bind_rows(unname(ts_barcodes)) %>%
  dplyr::rename(
    Antigen = Description,
    Oligo_ID = Barcode,
    Cat_Number = `Catalog`,
    Date_Released = `Date Released`,
    ENSEMBL_ID = `Ensembl Gene Id`,
    Barcode_Sequence = Sequence
  ) %>%
  dplyr::relocate(TotalSeq_Cat, .after = Clone) %>%
  tidyr::separate(
    Reactivity,
    into = c("Reactivity", "Cross_Reactivity"),
    sep = cross_rx_sep,
    fill = "right"
  ) %>%
  dplyr::mutate(across(where(is.character), stringr::str_squish)) %>%
  # Strip stray punctuation and "nbsp" residue from the gene identifiers
  dplyr::mutate(ENSEMBL_ID = gsub(";|:|nbsp", "", ENSEMBL_ID)) %>%
  dplyr::filter(!is.na(Antigen) | !is.na(Clone))
# Check that each oligo is assigned to exactly one
# Antigen / Clone / TotalSeq_Cat combination ----
# (The only entry is same antigen written differently TCR Vα2 / TCR Vα2)
# `x` is kept for manual inspection only; nothing below reads it.
# nPerGroup() is an AbNames helper that is accessed with `:::` elsewhere in
# this script, so it is namespace-qualified here too. It is expected to add
# the nAntigen / nClone count columns used in the filter.
x <- ts_barcodes %>%
  AbNames:::nPerGroup(group = "Oligo_ID",
                      col = c("Antigen", "Clone", "TotalSeq_Cat")) %>%
  dplyr::filter(dplyr::if_any(c(nAntigen, nClone), ~ .x > 1))

# Filter to remove non-human gene identifiers ----
# Split the comma-separated IDs into one row per ID, repair / validate each
# ID, then collapse them back to one string per catalogue number.
ts_barcodes <- ts_barcodes %>%
  AbNames:::splitUnnest(ab = "ENSEMBL_ID", split = ", ") %>%
  dplyr::mutate(
    # Some IDs are missing the leading "E"
    ENSEMBL_ID = gsub("^NSG", "ENSG", ENSEMBL_ID),
    # Drop a trailing single-digit ".N" suffix
    ENSEMBL_ID = gsub("\\.[0-9]$", "", ENSEMBL_ID),
    # Anything that does not end in ENSG<digits> is set to NA
    ENSEMBL_ID = ifelse(grepl("ENSG[0-9]+$", ENSEMBL_ID),
                        ENSEMBL_ID, NA)
  ) %>%
  dplyr::group_by(Cat_Number) %>%
  dplyr::mutate(ENSEMBL_ID = AbNames:::.toString(ENSEMBL_ID)) %>%
  # Do not let the grouping leak into later steps
  dplyr::ungroup() %>%
  unique()
# From this information, it appears that Antigen / Oligo / TotalSeq_Cat is
# enough to fill in Cat_Number and Clone
# Is Clone enough to fill in Antigen?
#---------------------------------------------------------------------------
# Format TotalSeq cocktail tables
#---------------------------------------------------------------------------
# Download TotalSeq cocktail information ----
bl <- "https://www.biolegend.com/Files/Images/BioLegend/totalseq/"
tsa <- paste0(bl, "TotalSeq_A_Human_Universal_Cocktail_v1_163_",
              "Antibodies_399907_Barcodes.xlsx")
tsb <- paste0(bl, "TotalSeq_B_Universal_Cocktail_v1_140_",
              "Antibodies_399904_Barcodes.xlsx")
tsc <- paste0(bl, "TotalSeq_C_Human_Universal_Cocktail_v1_137_",
              "Antibodies_399905_Barcodes.xlsx")
tsd <- paste0(bl, "TotalSeq_D_Human_Heme_Oncology_Cocktail_V01_Barcode.xlsx")
totalseq_fnames <- c(tsa, tsb, tsc, tsd)

# Fetch each workbook into a temporary file and read its first sheet.
totalseq <- lapply(totalseq_fnames, function(fn) {
  f <- tempfile(fileext = ".xlsx")
  # Remove the temporary download once it has been read
  on.exit(unlink(f), add = TRUE)
  # xlsx is a binary format: without mode = "wb" the file can be corrupted
  # on Windows, where download.file() defaults to text mode.
  download.file(fn, destfile = f, mode = "wb")
  readxl::read_xlsx(f)
})
nrows <- vapply(totalseq, nrow, integer(1))

# Get the TotalSeq category from the filename and add to tables ----
# One letter per file, matched to the tables by position, so the result does
# not depend on the files being listed in alphabetical category order.
ts_cat <- gsub(".*TotalSeq_([ABCD])_.*", "\\1", basename(totalseq_fnames))
totalseq <- lapply(seq_along(totalseq), function(i) {
  totalseq[[i]] %>% dplyr::mutate(TotalSeq_Cat = ts_cat[i])
})

# Combine the tables (joining on all shared columns) and squish whitespace;
# str_squish() is applied to every column.
totalseq <- Reduce(dplyr::full_join, totalseq) %>%
  dplyr::mutate(across(.cols = everything(), stringr::str_squish))

# Check that no rows have been lost or gained ----
nrow(totalseq) == sum(nrows)
# Coalesce columns with the same meaning ----
# The cocktail sheets name equivalent columns differently. Pick one name per
# concept, fill it from the alternatives, then drop the alternatives.
totalseq <- totalseq %>%
  dplyr::rename(
    Antigen = Description,
    Oligo_ID = DNA_ID,
    Barcode_Sequence = `Barcode sequence`,
    HGNC_SYMBOL = `Gene Name`,
    ENSEMBL_ID = `Ensembl ID`
  ) %>%
  dplyr::mutate(
    Barcode_Sequence = dplyr::coalesce(Barcode_Sequence, Barcode, Sequence),
    Oligo_ID = dplyr::coalesce(Oligo_ID, `Format / Barcode`),
    Clone = dplyr::coalesce(Clone, clone),
    ENSEMBL_ID = dplyr::coalesce(ENSEMBL_ID, `Ensemble ID`),
    HGNC_SYMBOL = dplyr::coalesce(HGNC_SYMBOL, `Gene name`),
    Antigen = dplyr::coalesce(Antigen, Specificity)
  ) %>%
  dplyr::select(
    -c(Barcode, `Format / Barcode`, clone, `Ensemble ID`, `Gene name`,
       Sequence, Specificity)
  ) %>%
  # Normalise the species order before the reactivity is extracted
  dplyr::mutate(Antigen = gsub("mouse/human", "human/mouse", Antigen)) %>%
  # Reactivity is the species part of the "anti-<species> " prefix, lower-cased
  dplyr::mutate(
    Reactivity = tolower(
      AbNames:::.gsubNA("^anti-([Hh]uman(/mouse)?(/rat)?).*$", "\\1", Antigen)
    )
  ) %>%
  # Strip the species prefix from the antigen name
  dplyr::mutate(Antigen = gsub("^anti-([A-z\\/]+)\\s", "", Antigen)) %>%
  # Two names start with Hu instead of "anti-human"
  dplyr::mutate(Antigen = gsub("^Hu\\s", "", Antigen)) %>%
  # Drop the first character of the oligo ID
  dplyr::mutate(Oligo_ID = substr(Oligo_ID, 2, nchar(Oligo_ID))) %>%
  dplyr::mutate(Antigen = AbNames::replaceGreekSyms(Antigen, "sym2letter")) %>%
  dplyr::relocate(Antigen, Clone, ENSEMBL_ID, HGNC_SYMBOL, Oligo_ID,
                  TotalSeq_Cat, Barcode_Sequence, Reactivity) %>%
  # Some TotalSeq B Ensembl_IDs are duplicated barcode sequences, set to NA ----
  dplyr::mutate(
    ENSEMBL_ID = ifelse(grepl("^ENSG", ENSEMBL_ID), ENSEMBL_ID, NA)
  )
#---------------------------------------------------------------------------
# Fill barcodes using cocktails ----
# In ts_barcodes the isotype controls have no Antigen name. The cocktail
# table (totalseq) does, so build a Clone -> Antigen lookup from it and use it
# to patch the missing names.
temp <- unique(dplyr::select(totalseq, Antigen, Clone)) %>%
  dplyr::group_by(Clone) %>%
  # Keep the first antigen where a clone has several
  # (checked manually, they are the same with different names)
  dplyr::slice_head(n = 1) %>%
  dplyr::ungroup()

# rows_patch() only fills values that are NA in ts_barcodes; clones in the
# lookup that are absent from ts_barcodes are ignored.
ts_barcodes <- dplyr::rows_patch(ts_barcodes, temp,
                                 by = c("Clone"), unmatched = "ignore") %>%
  # Drop rows still lacking an antigen, and rows without a clone
  # (when "Clone" is NA it's Biotin)
  dplyr::filter(!(is.na(Antigen) | is.na(Clone))) %>%
  dplyr::mutate(Cat_Number = as.character(Cat_Number),
                ENSEMBL_ID = na_if(ENSEMBL_ID, "")) %>%
  dplyr::ungroup()

# From here on the curated barcode table is the working totalseq table
totalseq <- ts_barcodes
# Fix importing errors ----
# Names are the mis-imported spellings, values are the corrected spellings.
imp_err <- c("TCR Vα2" = "TCR Vα2", "Fc?RI?" = "FceRIA")

# Replace every mis-imported spelling in `x` by its correction.
# Matching is literal (fixed = TRUE). Treated as a regular expression,
# "Fc?RI?" would match "FR", "FcR", ... inside unrelated names (e.g. "EGFR")
# and would never match the literal text "Fc?RI?".
fix_import_errors <- function(x, fixes) {
  for (i in seq_along(fixes)) {
    x <- gsub(names(fixes)[i], fixes[[i]], x, fixed = TRUE)
  }
  x
}

totalseq <- totalseq %>%
  dplyr::mutate(Antigen = fix_import_errors(Antigen, imp_err)) %>%
  # Select human or isotype control (Isotype controls have Reactivity NA)
  dplyr::filter(if_any(c(Reactivity, Cross_Reactivity), ~grepl("Human", .x)) |
                  (is.na(Reactivity) & is.na(ENSEMBL_ID)))
#---------------------------------------------------------------------------
# Checks
#---------------------------------------------------------------------------
# Check that there is only one Ensembl_ID per Antigen ----
# Manual corrections, keyed by Clone, that overwrite Antigen and ENSEMBL_ID.
# These were manually identified by anti_joining totalseq to hgnc,
# by looking at one antigen -> multiple genes or one gene -> multiple antigens,
# and checking the information on the BioLegend website.
# Where one antibody recognises several gene products, the ENSEMBL_IDs are
# given as a single comma-separated string.
# Note: "CDw" stands for "workshop", insufficiently characterised
fixes <- tibble::tribble(
~Antigen, ~Clone, ~ENSEMBL_ID,
# Update Antigen with more information
"CD209 (DC-SIGN)", "9E9A8", "ENSG00000090659",
"CD209/CD299 (DC-SIGN/L-SIGN)", "14E3G7",
"ENSG00000090659, ENSG00000104938",
"CD85g", "17G10.2", "ENSG00000239961",
"EGFR", "AY13", "ENSG00000146648",
"GARP (LRRC32)", "7B11", "ENSG00000137507",
"VEGFR-3 (FLT-4)", "9D9F9", "ENSG00000037280",
"CD11a/CD18 (LFA-1)", "m24", "ENSG00000005844, ENSG00000160255",
"CD207", "4C7", "ENSG00000116031",
"TCR Vδ2", "B6", "ENSG00000211821",
# www.beckman.ch, TCR Vα7.2 = TCRAV7S2, gene cards TCRAV7S2 = TRAV1-2
# Gapin (2014) TRAV1-2 = TCR Vα7.2
# From Biolegend: Vα7.2 joined with Jα33 is characteristic of MAIT cells
"TCR Vα7.2", "3C10", "ENSG00000256553",
# From totalseq website - clone SK1 recognises the alpha chain
"CD8a", "SK1", "ENSG00000153563",
# Add (NCAM) for consistency with other clone
"CD56 (NCAM)", "QA17A16", "ENSG00000149294",
# Overwrite incorrect ENSEMBL_ID
# (discovered by multiple antigens to same gene id)
# (verified via BioLegend / genenames websites)
"CD6", "BL-CD6", "ENSG00000013725",
"CD164", "67D2", "ENSG00000135535",
"CD31", "WM59", "ENSG00000261371",
"CD140a", "16A1", "ENSG00000134853",
"CD24", "M1/69" ,"ENSG00000272398",
"TCR Vγ9", "B3","ENSG00000211695",
# Totalseq website: HIR2 reacts with common epitope of CD235a and CD235b
"CD235ab", "HIR2", "ENSG00000170180, ENSG00000250361",
# Totalseq website: CD98 is a heterodimer
"CD98","MEM-108","ENSG00000103257, ENSG00000168003",
# Totalseq website: 6D4 antibody reacts with common epitope of MICA/MICB
"MICA/MICB", "6D4", "ENSG00000204516, ENSG00000204520",
# Totalseq website: CD158 HP-MA4 reacts with KIR2DL1, KIR2DS1, KIR2DS3, KIR2DS5
# KIR2DS3 and KIR2DS5 are on haplotype chromosomes
"CD158", "HP-MA4",
"ENSG00000125498, ENSG00000276387, ENSG00000277163, ENSG00000274739",
# Totalseq website: CD158b DX27 reacts with common epitope of
# KIR2DL2, KIR2DL3 and KIR2DS2
"CD158b", "DX27", "ENSG00000274412, ENSG00000243772, ENSG00000278300",
# Totalseq website: CD16 3G8 interacts with FcγRIIIa and FcγRIIIb receptors
"CD16", "3G8", "ENSG00000203747, ENSG00000162747",
# Totalseq website: CD66a/c/e binds epitope shared by CD66a, c and e
"CD66a/c/e", "ASL-32", "ENSG00000079385, ENSG00000086548, ENSG00000105388",
# Totalseq website: M1310G05 has higher affinity for
# IgG1 and IgG3 than IgG2 and IgG4
"IgG FC", "M1310G05", "ENSG00000211896, ENSG00000211897",
"TCR Vα24-Jα18", "6B11", "ENSG00000211805, ENSG00000211871",
# Totalseq website: Clone W6/32 reacts with beta2-microglobulin
# (the gene ID is set to the beta2-microglobulin gene to avoid ambiguity
# with the HLA gene IDs; the Antigen label itself stays "HLA-A,B,C")
"HLA-A,B,C", "W6/32", "ENSG00000166710",
# Assign CD3 to CD3E to match website
"CD3", "UCHT1", "ENSG00000198851",
"CD3", "SK7", "ENSG00000198851")
# Apply the corrections: rows whose Clone matches a row of `fixes` get that
# row's Antigen and ENSEMBL_ID; clones in `fixes` that are absent from
# totalseq are ignored.
totalseq <- totalseq %>%
dplyr::rows_update(fixes, by = "Clone", unmatched = "ignore")
# Blank out ambiguous gene IDs, then fill gaps within Antigen / Clone ----
totalseq <- totalseq %>%
  # Count the ENSEMBL_ID values recorded for each Antigen
  AbNames:::nPerGroup(group = "Antigen", "ENSEMBL_ID") %>%
  # An Antigen with more than one ENSEMBL_ID value gets NA instead
  # (18/7/22) - this is just CD3 mapped to CD3E and CD3D, fixed above
  dplyr::mutate(ENSEMBL_ID = ifelse(nENSEMBL_ID <= 1, ENSEMBL_ID, NA)) %>%
  # The helper count column is no longer needed
  dplyr::select(-nENSEMBL_ID) %>%
  # Fill in missing gene IDs for rows that share Antigen and Clone
  AbNames::fillByGroup(group = c("Antigen", "Clone"), fill = "ENSEMBL_ID")
# Add HGNC_SYMBOL from hgnc data set ----
# Reduce hgnc to one row per (HGNC_ID, HGNC_SYMBOL, ENSEMBL_ID) so that the
# join below adds the symbol and ID without duplicating totalseq rows.
data(hgnc)
hgnc <- hgnc %>%
  dplyr::select(HGNC_ID, HGNC_SYMBOL, ENSEMBL_ID) %>%
  dplyr::filter(!is.na(ENSEMBL_ID)) %>%
  # Was `SOURCE == "TotalSeq"`, which fails because SOURCE is not among the
  # selected columns; the pipe into unique() was also missing.
  # NOTE(review): assumes the intent was to label rows with a SOURCE column;
  # drop this line if no SOURCE column is wanted in the totalseq data set.
  dplyr::mutate(SOURCE = "TotalSeq") %>%
  unique()
totalseq <- left_join(totalseq, hgnc, by = c("ENSEMBL_ID"))
# Remove non-ascii characters -----
# Transliterate every character column to ASCII.
totalseq <- totalseq %>%
  dplyr::mutate(across(
    where(is.character),
    ~ stringi::stri_trans_general(.x,
                                  id = "Any-Latin;Greek-Latin;Latin-ASCII")
  ))

# Remove Greek and other special characters ----
# across() needs a function or lambda. The previous code passed the *result*
# of replaceGreekSyms("sym2letter") instead of applying it to each column.
# NOTE(review): after the ASCII pass above this step presumably finds nothing
# left to replace; the original order of operations is kept unchanged.
totalseq <- totalseq %>%
  dplyr::mutate(across(
    where(is.character),
    ~ AbNames::replaceGreekSyms(.x, "sym2letter")
  )) %>%
  dplyr::mutate(across(
    where(is.character),
    ~ stringi::stri_trans_general(.x,
                                  id = "Any-Latin;Greek-Latin;Latin-ASCII")
  ))
# Add ALT_ID from manual matches to the totalseq data set -----
original_nrow <- nrow(totalseq)

# Manually curated matches (path is relative to the package root).
# is.character (base R) replaces is_character, which is not provided by any
# of the packages attached at the top of this script.
mm <- readr::read_csv("inst/extdata/rols_ontology.csv") %>%
  dplyr::mutate(across(where(is.character), stringr::str_squish))
mm_ids <- mm %>% dplyr::select(Antigen, Clone, HGNC_ID, HGNC_SYMBOL, ALT_ID)

# Manual check that all totalseq genes have a match, needs gene_aliases
#totalseq %>%
# dplyr::filter(is.na(HGNC_ID), grepl("[Hh]uman", Reactivity)) %>%
# dplyr::select(Antigen, Clone) %>%
# filter(! (Antigen %in% mm$Antigen |
# Clone %in% mm$Clone |
# Antigen %in% gene_aliases$value)) %>%
# unique() %>% arrange(Antigen) %>% data.frame()

# We do not want to join by HGNC_ID as e.g. TRA-1-60-R matches PODXL but is not
# in the totalseq table
totalseq <- totalseq %>%
  AbNames:::left_join_any(mm_ids, cols = c("Antigen", "Clone"),
                          shared = "update") %>%
  # Where no manual ALT_ID exists, fall back to the HGNC_ID
  dplyr::mutate(ALT_ID = dplyr::coalesce(ALT_ID, HGNC_ID))
new_nrow <- nrow(totalseq)

# Check that totalseq did not gain rows during merge
original_nrow == new_nrow

# Create totalseq data set ----
totalseq <- as.data.frame(totalseq)
usethis::use_data(totalseq, overwrite = TRUE, compress = "bzip2")
# Code for manually checking for inconsistencies ----
# # Antigens that have ENSEMBL ID in hgnc but do not share an alias
# aj <- semi_join(totalseq, hgnc, by = c("ENSEMBL_ID")) %>%
# anti_join(hgnc, by = c("Antigen" = "value")) %>%
# left_join(hgnc, by = "ENSEMBL_ID") %>%
# dplyr::select(Antigen, HGNC_SYMBOL, value, Clone, ENSEMBL_ID) %>%
# group_by(Antigen)
#
# # Check different antigen mapped to same gene
# sg <- totalseq %>%
# group_by(ENSEMBL_ID) %>%
# dplyr::filter(n_distinct(Antigen) > 1)
#
#
# # Select genes with null reactivity or Ensembl ID
# nrx <- totalseq %>%
# dplyr::filter(is.na(Reactivity) | is.na(ENSEMBL_ID)) %>%
# dplyr::select(-Barcode_Sequence, -Date_Released,
# -Oligo_ID, -TotalSeq_Cat) %>%
# dplyr::arrange(Antigen, Clone) %>%
# dplyr::group_by(Antigen, Clone)
# Notes ----
# TCR Va24 = TRAV10 Gapin "Check MAIT", https://www.beckman.ch
# (IGRa02 is only described sequence)
# https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:12103
# https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:12121
# from beckman TCR Vβ13.1 is also called TCRBV13S1
# TCRBV13S1 is HGNC alias for TRBV6-5 HGNC:12230 and TRBV13 HGNC:12188
# To do:
# CD45RA, CD45RB, CD45R0, TRAV24
# HLA-A,B,C, HLA-A2
# HLA-DR L234 - does not cross react with DP and DQ - binds HLA-DRa
# HLA-DR, DP, DQ Tu39
# TRA-1-60-R != PODXL? TRA-1-81 != PODXL
# (from abcam) TRA-1-60 - a carbohydrate epitope associated with podocalyxin
# (from stemcell) TRA1-81 is a carbohydrate epitope associated with podocalyxin
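# NOTE: the two lines below are web-page footer text captured together with
# this script, not R code; they are commented out so that the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.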