library(cBioPortalData)
# Open a handle to the cBioPortal API and collect the ID of every study.
cbioportal <- cBioPortal()
studyIds <- getStudies(cbioportal)[["studyId"]]

# Flag the IDs that contain the literal text "pan_can" and keep the rest, as a
# character vector named by its own values so that lapply() output is keyed
# by study ID.
pancans <- grepl("pan_can", studyIds, fixed = TRUE)
allIds <- setNames(nm = studyIds[!pancans])

# Load the package in the working directory. The code further down calls
# .invoke_bind(), which this script does not define -- presumably it is made
# available by this step (confirm).
devtools::load_all()
# For each study, fetch the patient-level ETHNICITY and RACE clinical data and
# spread it so that each attribute becomes its own column. Studies that do not
# define both attributes yield an empty tibble.
res <- lapply(allIds, function(study) {
  ethrace <- c("ETHNICITY", "RACE")

  ca <- .invoke_bind(
    cbioportal, "getAllClinicalAttributesInStudyUsingGET",
    FALSE, studyId = study
  )

  # Check the attribute list first: a study lacking either attribute is
  # skipped without requesting its patient list at all.
  if (!all(ethrace %in% ca[["clinicalAttributeId"]])) {
    return(tibble::tibble())
  }

  ptids <- .invoke_bind(
    cbioportal, "getAllPatientsInStudyUsingGET",
    FALSE, studyId = study
  )[["patientId"]]

  clin <- .invoke_bind(
    cbioportal, "fetchAllClinicalDataInStudyUsingPOST", FALSE,
    clinicalDataType = "PATIENT",
    studyId = study,
    attributeIds = ethrace,
    ids = ptids,
    projection = "SUMMARY"
  )

  # Long (clinicalAttributeId, value) pairs -> one column per attribute.
  tidyr::spread(clin, clinicalAttributeId, value)
})
# Names of the studies whose fetch result is empty (zero columns).
noData <- names(res[lengths(res) == 0L])

# Keep the studies that returned a non-empty table and cache them on disk.
results <- res[lengths(res) > 0L]
save(results, file = "allStudiesWithRE.rda")

## percentage of studies with RACE and ETHNICITY attributeIds
round((length(results) / length(allIds)) * 100, 1)
# Pool the patient IDs of all retained studies, in study order, and flag every
# repeat occurrence of an ID (the first occurrence is not flagged).
allptids <- unname(unlist(lapply(results, function(tbl) tbl["patientId"])))
dups <- duplicated(allptids)

# Cut the pooled flags back into one logical vector per study. split() orders
# its output by group label, so index by names(results) to get study order
# back, and verify that the names line up before using the flags.
rows_per_study <- vapply(results, nrow, integer(1L))
duplist <- split(dups, rep(names(results), rows_per_study))
orderduplist <- duplist[names(results)]
stopifnot(identical(names(results), names(orderduplist)))

# Remove the flagged rows from each study, then drop studies left with no rows.
urest <- Map(
  function(study_tbl, is_dup) study_tbl[!is_dup, ],
  results, orderduplist
)
urest <- urest[vapply(urest, nrow, integer(1L)) > 0L]

# Stack the per-study tables and report how many studies still contribute.
allrows <- dplyr::bind_rows(urest)
length(unique(allrows[["studyId"]]))
# Harmonise the free-text RACE values into a small set of categories.
# Matching is done on lower-cased text with square brackets stripped. The
# rules run in sequence and later rules see the output of earlier ones, so
# their order matters. NA values match no rule and stay NA.
race2 <- gsub("\\[|\\]", "", tolower(allrows[["RACE"]]))

amin <- grepl("american indian", race2, fixed = TRUE)
race2[amin] <- "Am. Indian / Hawaiian or P.I."
hwpacific <- c("samoan", "hawaiian", "fiji islander",
  "native hawaiian or other pacific islander")
race2[race2 %in% hwpacific] <- "Am. Indian / Hawaiian or P.I."

# %in% (rather than ==) keeps NA out of the logical mask.
cauc <- grepl("caucasian", race2, fixed = TRUE)
cauc <- cauc | race2 %in% "white"
race2[cauc] <- "White"

# Every value containing "caucasian" has already been relabelled "White", so
# a plain substring search for "asian" cannot pick those up. The previous
# pattern "[^cauc]asian" was a negated character class (one character other
# than c, a or u, followed by "asian"); it required a leading character and
# so missed values that begin with "asian", such as "asian indian".
asianos <- grepl("asian", race2, fixed = TRUE)
asianos <- asianos | race2 %in% c("filipino", "laotian")
race2[asianos] <- "Asian"

ab <- c("african american", "black", "black or african american")
race2[race2 %in% ab] <- "Black or African Am."

# Fold the "not reported"-style values into "unknown" first, then merge
# "unknown" with the remaining catch-all values.
unk <- c("not reported", "not evaluated")
race2[race2 %in% unk] <- "unknown"
other <- c("unknown", "other", "hispanic")
race2[race2 %in% other] <- "Other / Unknown"

# Show the resulting category counts and store the recoded column.
table(race2)
allrows[["race"]] <- race2
library(ggplot2)
# Count patients per harmonised race category and express each count as a
# percentage of all patients (one decimal place). The dplyr verbs are
# namespace-qualified, like dplyr::bind_rows earlier in this script, because
# neither dplyr nor the magrittr pipe is attached with library().
dat <- dplyr::summarize(dplyr::group_by(allrows, race), counts = dplyr::n())
dat <- dplyr::mutate(dat, percentage = round(counts / sum(counts) * 100, 1))

# Sort by ascending percentage and freeze that order in the factor levels,
# which is what determines the order of the stacked segments and the legend.
dat_sort <- dplyr::arrange(dat, percentage)
dat_sort$race <- factor(dat_sort$race, levels = dat_sort$race)

# A constant column gives the single stacked bar its x position.
dat_sort <- dplyr::mutate(dat_sort, database = "cBioPortal")

# One stacked bar, each segment labelled with its percentage; the x-axis title
# reports the number of studies contributing rows.
ggplot(data = dat_sort, aes(x = database, y = percentage, fill = race)) +
  geom_col() +
  geom_text(
    aes(label = paste0(percentage, "%")),
    position = position_stack(vjust = 0.5)
  ) +
  scale_fill_brewer(palette = "Set3") +
  theme_minimal(base_size = 24) +
  theme(axis.text.x = element_blank()) +
  ylab("Percentage") +
  xlab(paste0(length(unique(allrows[["studyId"]])), " cBioPortal Studies")) +
  labs(fill = "Race")
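# NOTE(review): the two lines below look like web-page footer text captured
# together with the script; they are kept as comments so the file parses.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.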