Prototype taxonomic name checking function for SEOSAW
DATE: 2025-07-09
AUTHOR: John L. Godlee
We are hoping to move the SEOSAW[1] database to use World Flora Online[2] (WFO) as its taxonomic backbone, rather than the African Plant Database[3] (APD) which we have been using since 2019.
The WorldFlora R package[4], developed by Roeland Kindt, provides code for querying a downloaded copy of the WFO database and providing taxonomic information. The key features we want for SEOSAW are: check the validity of taxonomic names in tree inventory data, fuzzy find accepted names to catch spelling errors, and provide accepted names for synonyms.
I've written a wrapper function around the code in the WorldFlora R package. Some features:
- Optionally submit a lookup table to replace unmatched names.
- Consolidate consecutive whitespaces in the WFO database to a single space, which can lead to poor matching.
- Use WorldFlora::WFO.prepare() with a default set of replacements to fix common orthographic errors.
- Use WorldFlora::WFO.one() to find the best match where multiple names are fuzzy matched by WorldFlora::WFO.match().
- Extract subspecies and variety epithets from matched names.
- Optionally return unmatched or multiply-matched names.
#' Replace taxonomic names using lookup tables
#'
#' @param x vector of species names
#' @param lookup a single dataframe or a list of dataframes containing lookup
#' tables. The first column should contain names in `x` to be changed. The
#' second column should contain the new names.
#'
#' @return Vector of corrected species names
#'
#' @details Lookup tables are run in order through the list of lookup tables, meaning
#' names may change incrementally multiple times.
#'
#' @export
#'
synonymyFix <- function(x, lookup) {
# Make list if not already
if (!inherits(lookup, "list")) {
lookup <- list(lookup)
}
# Combine lookup tables into a single dataframe
lookup_combi <- as.data.frame(fastRbind(lookup))
# Check no NAs
if (any(is.na(lookup_combi))) {
stop("Lookup table cannot contain NA entries")
}
# Do substitution
out <- lookup_combi[,2][match(x, lookup_combi[,1])]
out[is.na(out)] <- x[is.na(out)]
return(out)
}
#' Return default pattern substitution for `taxonCheck()`
#'
#' @return vector of regex patterns for use with `taxonCheck()` in argument
#' `sub.pattern`
#'
#' @export
#'
WFO.prepare_default <- function() {
c(
" indet$",
" sp[.]",
" spp[.]",
" ssp[.]",
" pl[.]",
" indet[.]",
" ind[.]",
" gen[.]",
" g[.]",
" fam[.]",
" nov[.]",
" prox[.]",
" cf[.]",
" aff[.]",
" s[.]s[.]",
" s[.]l[.]",
" p[.]p[.]",
" p[.] p[.]",
"[?]",
" inc[.]",
" stet[.]",
"nom[.] cons[.]",
"nom[.] dub[.]",
" nom[.] err[.]",
" nom[.] illeg[.]",
" nom[.] inval[.]",
" nom[.] nov[.]",
" nom[.] nud[.]",
" nom[.] obl[.]",
" nom[.] prot[.]",
" nom[.] rej[.]",
" nom[.] supp[.]",
" sensu auct[.]"
)
}
#' Correct and match taxonomic names to the World Flora Taxonomic Backbone
#'
#' @param x vector of taxonomic names
#' @param WFO.file optional file name of static copy of World Flora Online
#' Taxonomic Backbone. If not NULL, data will be reloaded from this file
#' @param WFO.data optional dataset with static copy of World Flora Online
#' Taxonomic backbone. Ignored if `WFO.file` is not NULL
#' @param lookup optional a single dataframe or a list of dataframes containing
#' lookup tables. The first column should contain names in `x` to be
#' changed. The second column should contain the new names.
#' @param ret_wfo logical, if TRUE the function stops after
#' `WorldFlora::WFO.match()` and returns the raw output from this function.
#' @param ret_unk logical, if TRUE taxa not matched in the World
#' Flora Online are returned to the user as a vector containing
#' the unmatched values. If FALSE these taxa are returned as NA.
#' @param ret_multi logical, if TRUE taxa matching multiple records in the
#' World Flora Online are returned to the user as a list with one element
#' for each original name containing the unmatched values. If FALSE the
#' "best" name is selected by `WorldFlora::WFO.one()`
#' @param sub.pattern vector with regular expressions defining sections of `x`
#' to be removed during correction of common orthographic errors by
#' `WorldFlora::WFO.prepare()`
#' @param fuzzy If larger than 0, then attempt fuzzy matching. See `WorldFlora::WFO.match()`
#' @param ... Additional arguments passed to `WorldFlora::WFO.match()`
#'
#' @return Dataframe with cleaned taxonomic names and metadata
#'
#' @details
#' Taxonomic names are matched against the World Flora Online database using
#' `WorldFlora::WFO.match()`.
#'
#' The search algorithm is as follows:
#' \enumerate{
#' \item{Optionally replace names with `lookup`}
#' \item{Correct common orthographic errors with `WorldFlora::WFO.prepare()`}
#' \item{Query `WorldFlora::WFO.match()` for accepted
#' name and taxonomic rank information}
#' \item{Optionally return multiple matches or unsuccessful matches}
#' \item{Consolidate multiple matches with `WorldFlora::WFO.one()`}
#' \item{Return formatted dataframe}
#' }
#'
#' Names that cannot be matched should be replaced with "Indet indet" in
#' `lookup`. These are replaced with NA_character_ before `WorldFlora::WFO.match()`
#'
#' @importFrom data.table fread data.table
#' @importFrom WorldFlora WFO.prepare WFO.match WFO.one
#'
#' @export
#'
taxonCheck <- function(x, WFO.file = NULL, WFO.data = NULL,
lookup = NULL, ret_wfo = FALSE, ret_unk = FALSE, ret_multi = FALSE,
sub.pattern = WFO.prepare_default(), fuzzy = 0.1, ...) {
# Check WFO data is available
if (is.null(WFO.data) & is.null(WFO.file)) {
stop("Either WFO.data or WFO.file must be provided")
}
if (is.null(WFO.data)) {
message(paste("Reading WFO data"))
if (!file.exists(WFO.file)) {
stop("If WFO.data is NULL, a valid WFO.file must be provided. See WorldFlora::WFO.download()")
}
WFO.data <- data.table::fread(WFO.file, encoding = "UTF-8")
} else {
WFO.data <- data.table::data.table(WFO.data)
}
WFO.data$scientificName <- gsub("\\s+", " ", WFO.data$scientificName)
# Get unique taxonomic names
xu <- unique(x)
# Substitute names with lookup table
if (!is.null(lookup)) {
message("Substituting names with `lookup`")
xf <- synonymyFix(xu, lookup = lookup)
} else {
xf <- xu
}
# Prepare taxonomic names for WFO query
xs <- WorldFlora::WFO.prepare(xf, sub.pattern = sub.pattern)$spec.name
# Replace Indet genera with ""
xi <- xs
xi[xi == "Indet"] <- ""
# Run WFO matching
message("Querying World Flora Online")
wfo <- WorldFlora::WFO.match(unique(xi),
WFO.data = WFO.data, Fuzzy = fuzzy, ...)
# Optionally return raw WFO output
if (ret_wfo) {
# Add original names
wfo_all <- dplyr::bind_rows(lapply(seq_along(xu), function(i) {
orig <- xu[i]
cbind("taxon_name_orig" = orig, wfo[wfo$spec.name.ORIG == xi[i],])
}))
# Check all original names are matched back
stopifnot(all(!is.na(wfo_all$taxon_name_orig)))
return(wfo_all)
}
# Consolidate to single best name per taxon
wfo_one <- WorldFlora::WFO.one(wfo, verbose = FALSE)
# Add original names
wfo_one_all <- dplyr::bind_rows(lapply(seq_along(xu), function(i) {
orig <- xu[i]
cbind("taxon_name_orig" = orig, wfo_one[wfo_one$spec.name.ORIG == xi[i],])
}))
# Check all original names are matched back
stopifnot(all(!is.na(wfo_one_all$taxon_name_orig)))
wfo_sel <- wfo_one_all[,c(
"taxon_name_orig",
"spec.name.ORIG", # taxon_name_sanit
"Old.name", # taxon_name_syn
"Old.ID", # taxon_wfo_syn
"scientificName", # taxon_name_acc
"taxonID", # taxon_wfo_acc
"scientificNameAuthorship", # taxon_auth_acc
"taxonRank", # taxon_rank_acc
"parentNameUsageID", # taxon_wfo_parent
"specificEpithet", # taxon_epithet_acc
"genus", # taxon_genus_acc
"family" # taxon_family_acc
)]
wfo_sel <- unique(wfo_sel)
# All submitted names should be included in WFO output
stopifnot(all(sort(unique(wfo_sel$spec.name.ORIG)) == sort(unique(xi))))
# Consolidate genus and species
wfo_sel$species <- trimws(paste(wfo_sel$genus, wfo_sel$specificEpithet))
wfo_sel$species <- ifelse(!wfo_sel$taxonRank %in%
c("species", "subspecies", "variety", "subvariety",
"form", "subform", "prole", "unranked"),
NA_character_, wfo_sel$species)
# Extract subsp. and var. epithets from accepted names
wfo_sel$taxon_subspecies_acc <- gsub(".*subsp\\.\\s", "", wfo_sel$scientificName)
wfo_sel$taxon_subspecies_acc[!grepl("\\ssubsp\\.\\s", wfo_sel$scientificName)] <- NA_character_
wfo_sel$taxon_variety_acc <- gsub(".*var\\.\\s", "", wfo_sel$scientificName)
wfo_sel$taxon_variety_acc[!grepl("\\svar\\.\\s", wfo_sel$scientificName)] <- NA_character_
# Fill wfo ID of synonyms
wfo_sel$Old.ID <- ifelse(wfo_sel$Old.ID == "",
wfo_sel$taxonID, wfo_sel$Old.ID)
wfo_sel$Old.name <- ifelse(wfo_sel$Old.name == "",
wfo_sel$scientificName, wfo_sel$Old.name)
# Add date of processing
wfo_sel$taxon_wfo_date <- Sys.Date()
# Create output dataframe
out <- wfo_sel[,c(
"taxon_name_orig",
"spec.name.ORIG", # taxon_name_sanit
"Old.name", # taxon_name_syn
"Old.ID", # taxon_wfo_syn
"scientificName", # taxon_name_acc
"taxonID", # taxon_wfo_acc
"scientificNameAuthorship", # taxon_auth_acc
"taxonRank", # taxon_rank_acc
"parentNameUsageID", # taxon_wfo_parent
"taxon_variety_acc",
"taxon_subspecies_acc",
"specificEpithet", # taxon_epithet_acc
"species", # taxon_species_acc
"genus", # taxon_genus_acc
"family", # taxon_family_acc
"taxon_wfo_date")]
names(out) <- c(
"taxon_name_orig",
"taxon_name_sanit",
"taxon_name_syn",
"taxon_wfo_syn",
"taxon_name_acc",
"taxon_wfo_acc",
"taxon_auth_acc",
"taxon_rank_acc",
"taxon_wfo_parent",
"taxon_variety_acc",
"taxon_subspecies_acc",
"taxon_epithet_acc",
"taxon_species_acc",
"taxon_genus_acc",
"taxon_family_acc",
"taxon_wfo_date")
# Optionally return unmatched names
if (ret_unk & any(out$taxon_name_sanit != out$taxon_name_syn, na.rm = TRUE)) {
unmatched <- out$taxon_name_orig[
(out$taxon_name_sanit != out$taxon_name_syn) |
is.na(out$taxon_name_syn) | is.na(out$taxon_name_sanit)]
warning("Some taxonomic names not matched by WFO.match, returning original names")
return(unmatched)
}
# Optionally return names with multiple matches
if (ret_multi & any(duplicated(wfo$taxon_name_orig))) {
multis <- wfo$taxon_name_orig[duplicated(wfo$taxon_name_orig)]
multis_df <- wfo[wfo$taxon_name_orig %in% multis,]
multis_list <- split(multis_df, multis_df$taxon_name_orig)
warning("Some taxonomic names matched to multiple names by WFO.match, returning options")
return(multis_list)
}
# Change "" to NA in all columns
out[] <- lapply(out, function(x) {
if (is.character(x)) {
x[x == ""] <- NA_character_
}
x
})
# All original names should be filled
stopifnot(all(!is.na(out$species[out$species_sanit != "Indet indet"])))
# Return
return(out)
}