R/cacheauthors.R
In datelife: Scientific Data on Time of Lineage Divergence for Your Taxa

Documented in make_all_associations make_contributor_cache make_otol_associations make_overlap_table make_treebase_associations make_treebase_cache

#' Create a cache from Open Tree of Life
#' @param outputfile Path including file name
#' @return List containing author and curator results
#' @export
make_contributor_cache <- function(outputfile = "contributorcache.RData") {
  all.sources <- rotl::tol_about(include_source_list = TRUE)
  all.studies <- as.list(rep(NA, length(all.sources$source_list)))
  author.results <- data.frame()
  curator.results <- data.frame()
  missed_doi <- c()
  for (i in sequence(length(all.studies))) {
    study.id <- strsplit(all.sources$source_list[[i]], "@")[[1]][[1]]
    try(all.studies[[i]] <- rotl::get_study_meta(study.id)) # some studies have no metadata
    doi <- authors <- clade.name <- curator <- study.year <- NULL
    try(doi <- gsub("https?://(dx\\.)?doi.org/", "", attr(rotl::get_publication(all.studies[[i]]), "DOI")))
    if (length(doi) == 0) {
      missed_doi <- c(missed_doi, study.id)
      warning(paste(study.id, "has no DOI attribute, author names will not be retrieved.")) # where else could we obtain a study's doi???
    } else {
      try(authors <- as.character(knitcitations::bib_metadata(doi)$author))
      if (is.null(authors)) {
        warning(paste(study.id, "DOI page was not found, author names will not be retrieved."))
        missed_doi <- c(missed_doi, study.id)
      }
    }
    # The following is necessary to establish good encoding in author.results object:
    if (!is.null(authors) & length(authors) > 0) {
      Encoding(authors) <- "latin1"
      authors <- iconv(authors, "latin1", "UTF-8")
    }
    try(clade.name <- all.studies[i][[1]]$nexml$`^ot:focalCladeOTTTaxonName`)
    try(curators <- unique(rotl::get_study_meta(study.id)[["nexml"]][["^ot:curatorName"]][[1]]))
    if (!is.null(authors)) {
      # author.results <- rbind(author.results, expand.grid(author=authors, study=study.id, clade=clade.name, publication=get_publication(all.studies[[i]])[1], doi=attr(get_publication(all.studies[[i]]),"DOI")))
      author.results <- rbind(author.results, expand.grid(person = authors, study = study.id, clade = clade.name, stringsAsFactors = FALSE))
    }
    if (!is.null(curators)) {
      # author.results <- rbind(author.results, expand.grid(author=authors, study=study.id, clade=clade.name, publication=get_publication(all.studies[[i]])[1], doi=attr(get_publication(all.studies[[i]]),"DOI")))
      curator.results <- rbind(curator.results, expand.grid(person = curators, study = study.id, clade = clade.name, stringsAsFactors = FALSE))
    }
    message(i, " of ", length(all.studies), " done ")
  }
  message("Authors from studies with the following id's could not be retrieved:")
  for (i in seq_len(missed_doi)) {
    message("\t", missed_doi[i])
  }
  author.pretty <- make_overlap_table(author.results)
  try(Encoding(author.pretty$person) <- "latin1")
  try(author.pretty <- iconv(author.pretty$person, "latin1", "UTF-8"))
  curator.pretty <- make_overlap_table(curator.results)
  try(Encoding(curator.pretty$person) <- "latin1")
  try(curator.pretty <- iconv(curator.pretty$person, "latin1", "UTF-8"))
  save(author.results, curator.results, author.pretty, curator.pretty, missed_doi, file = outputfile)
  return(list(author.results = author.results, curator.results = curator.results, author.pretty = author.pretty, curator.pretty = curator.pretty, missed_doi = missed_doi))
}


#' Create a cache from TreeBase
#' @param outputfile Path including file name
#' @return List containing author and curator results
#' @export
make_treebase_cache <- function(outputfile = "treebasecache.RData") { # nocov start
  InvertNames <- function(author) {
    return(paste(sapply(strsplit(author, ", "), rev), collapse = " "))
  }
  all.studies <- treebase::download_metadata("", by = "all")
  unlist(all.studies[[1]])[which(names(unlist(all.studies[[1]])) == "creator")]
  author.results <- data.frame()
  for (i in sequence(length(all.studies))) {
    authors <- study.id <- NULL
    try(authors <- unlist(all.studies[[i]])[which(names(unlist(all.studies[[i]])) == "creator")])
    try(study.id <- unlist(all.studies[[i]])["identifier"])
    if (!is.null(authors)) {
      authors <- sapply(authors, InvertNames)
      author.results <- rbind(author.results, expand.grid(person = authors, study = study.id, stringsAsFactors = FALSE))
    }
    message(paste(i, "of", length(all.studies), "done."))
  }
  tb.author.pretty <- make_overlap_table(author.results)[, -3]
  tb.author.results <- author.results
  save(tb.author.results, tb.author.pretty, file = outputfile)
  return(list(tb.author.results = tb.author.results, tb.author.pretty = tb.author.pretty))
} # nocov end

#' Create an overlap table
#' @param results_table An "author.results" or "curator.results" `data.frame`
#' @return A `data.frame` with information on curators and what clades they've worked on
#' @export
make_overlap_table <- function(results_table) {
  unique.person <- unique(results_table$person)
  final.table <- data.frame()
  for (person.index in sequence(length(unique.person))) {
    local.df <- subset(results_table, results_table$person == unique.person[person.index])
    clades <- table(as.character(local.df$clade))
    names(clades)[which(nchar(names(clades)) == 0)] <- "Unknown"
    clade.string <- paste(sort(paste0(names(clades), " (", clades, ")")), collapse = ", ")
    person.split <- strsplit(unique(local.df$person), " ")[[1]]
    last.name.guess <- person.split[length(person.split)]
    final.table <- rbind(final.table, data.frame(lastname = last.name.guess, person = unique.person[person.index], study.count = nrow(local.df), clades = clade.string, stringsAsFactors = FALSE))
  }
  final.table <- final.table[order(final.table$lastname), ]
  final.table <- final.table[, -1]
  return(final.table)
}

#' Associate TreeBase authors with studies
#' @return `data.frame` with author last name, author first and other names, and comma delimited URLs for TreeBase studies
make_treebase_associations <- function() {
  all.studies <- treebase::download_metadata("", by = "all")
  unlist(all.studies[[1]])[which(names(unlist(all.studies[[1]])) == "creator")]
  author.results <- data.frame()
  for (i in sequence(length(all.studies))) {
    authors <- study.id <- NULL
    try(authors <- unlist(all.studies[[i]])[which(names(unlist(all.studies[[i]])) == "creator")])
    try(study.id <- unlist(all.studies[[i]])["identifier"])
    try(study.url <- gsub("purl.org/phylo/treebase/phylows/study/TB2:S", "https://treebase.org/treebase-web/search/study/summary.html?id=", unlist(all.studies[[i]])["identifier"]))

    try(publisher <- unlist(all.studies[[i]])["publisher"])
    try(year <- unlist(all.studies[[i]])["date"])

    if (!is.null(authors)) {
      author.results <- rbind(author.results, expand.grid(person = authors, url = paste0('<a href="', study.url, '">', year, " ", publisher, " (TreeBase)</a>"), stringsAsFactors = FALSE))
    }
    message(paste(i, "of", length(all.studies), "done."))
  }

  author.lumped <- stats::aggregate(author.results$url, by = list(author = author.results$person), FUN = paste, collapse = ", ")
  colnames(author.lumped) <- c("person", "urls")
  return(author.lumped)
}

#' Associate Open Tree of Life authors with studies
#' @return `data.frame` with author last name, author first and other names, and comma delimited URLs for OToL studies
make_otol_associations <- function() {
  # .res <- system("curl -X POST https://api.opentreeoflife.org/v3/studies/find_studies -H 'content-type:application/json'", intern=TRUE)
  # .res <- httr::POST("https://api.opentreeoflife.org/v3/studies/find_studies")
  #  res <- vapply(.res[["matched_studies"]], function(x) x[["ot:studyId"]],character(1))

  all.trees <- rbind(rotl::studies_find_trees(property = "is_deprecated",
                                              value = "false",
                                              verbose = TRUE,
                                              detailed = TRUE),
                     rotl::studies_find_trees(property = "is_deprecated",
                                              value = "true",
                                              verbose = TRUE,
                                              detailed = TRUE)
  )
  # trees <- list()
  authors <- list()
  curators <- list()
  studies <- list()
  dois <- list()
  years <- list()
  tree.count <- 0
  bad.ones <- c()
  studyids <- list()

  for (study.index in sequence(dim(all.trees)[1])) {
    study.id <- all.trees$study_ids[study.index]
    doi <- NULL
    try(doi <- gsub("https?://(dx\\.)?doi.org/", "", attr(rotl::get_publication(rotl::get_study_meta(study.id)), "DOI")))
    authors <- append(authors, NA)
    dois <- append(dois, NA)
    years <- append(years, NA)
    studyids <- append(studyids, NA)
    try(studyids[length(studyids)] <- study.id)
    try(years[length(years)] <- rotl::get_study_meta(study.id)$nexml$`^ot:studyYear`)
    if (length(doi) == 0) {
      warning(paste(study.id, "has no DOI attribute, author names will not be retrieved."))
    } else {
      try(authors[length(authors)] <- list(paste(as.character(knitcitations::bib_metadata(doi)$author))))
      try(dois[length(dois)] <- doi)
    }
    try(studies <- append(studies, study.id))
  }
  for (i in sequence(length(authors))) {
    if (!any(is.na(authors[[i]])) & length(authors[[i]]) > 0) {
      Encoding(authors[[i]]) <- "latin1"
      authors[[i]] <- iconv(gsub("^\\s+|\\s+$", "", authors[[i]]), "latin1", "UTF-8")
    }
  }
  studies <- unlist(studies)
  combined.df <- data.frame()
  invert_names <- function(x) {
    x.split <- strsplit(x, " ")[[1]]
    final.names <- paste0(gsub("^\\s+|\\s+$", "", x.split[length(x.split)]), ", ", paste0(x.split[-length(x.split)], collapse = " "))
    return(final.names)
  }
  for (i in sequence(length(authors))) {
    urls <- paste0("<a href='https://tree.opentreeoflife.org/curator/study/view/", studyids[[i]], "'>", years[[i]], " ", studyids[[i]], " (OpenTree)</a>")
    for (author.index in sequence(length(authors[[i]]))) {
      combined.df <- rbind(combined.df, data.frame(person = invert_names(authors[[i]][author.index]), url = urls, stringsAsFactors = FALSE))
    }
  }
  author.lumped <- stats::aggregate(combined.df$url, by = list(author = combined.df$person), FUN = paste, collapse = ", ")
  colnames(author.lumped) <- c("person", "urls")
  return(author.lumped)
}

#' Find all authors and where they have deposited their trees
#' @param outputfile Path including file name. NULL to prevent saving.
#' @return a `data.frame` of "person" and "urls".
make_all_associations <- function(outputfile = "depositorcache.RData") {
  tb <- make_treebase_associations()
  ot <- make_otol_associations()
  combined.df <- rbind(tb, ot)
  author.lumped <- stats::aggregate(combined.df$urls, by = list(author = combined.df$person), FUN = paste, collapse = ", ")
  colnames(author.lumped) <- c("person", "urls")
  author.lumped$person <- gsub("^\\s+|\\s+$", "", author.lumped$person)
  author.lumped <- author.lumped[order(author.lumped$person), ]
  depositors <- author.lumped
  if (!is.null(outputfile)) {
    save(depositors, file = outputfile)
  }
  return(author.lumped)
}

Any scripts or data that you put into this service are public.

datelife documentation built on July 10, 2023, 2:02 a.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

datelife
Scientific Data on Time of Lineage Divergence for Your Taxa

R/cacheauthors.R
In datelife: Scientific Data on Time of Lineage Divergence for Your Taxa

Defines functions make_all_associations make_otol_associations make_treebase_associations make_overlap_table make_treebase_cache make_contributor_cache

Documented in make_all_associations make_contributor_cache make_otol_associations make_overlap_table make_treebase_associations make_treebase_cache

Try the datelife package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

datelife Scientific Data on Time of Lineage Divergence for Your Taxa

R/cacheauthors.R In datelife: Scientific Data on Time of Lineage Divergence for Your Taxa

Defines functions make_all_associations make_otol_associations make_treebase_associations make_overlap_table make_treebase_cache make_contributor_cache

Documented in make_all_associations make_contributor_cache make_otol_associations make_overlap_table make_treebase_associations make_treebase_cache

Try the datelife package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

datelife
Scientific Data on Time of Lineage Divergence for Your Taxa

R/cacheauthors.R
In datelife: Scientific Data on Time of Lineage Divergence for Your Taxa