GeoTox Package Data • GeoTox

The GeoTox package includes example data geo_tox_data. This article provides a description of the data and example code for how it was gathered.

Note

The websites linked in this vignette were last accessed on March 20, 2026. The latest R package versions were used at that time.

Warning

The package data are being pulled from various sources and are connected using FIPS codes. FIPS codes are a simple way to connect data, but they can change. For example, Connecticut began the process of going from 8 legacy counties to 9 planning regions starting in 2022 and became effective in 2024.

Setup

Load packages and create an empty list to store data.

library(dplyr)
library(httk)
library(httr2)
library(purrr)
library(readr)
library(readxl)
library(sf)
library(stringr)
library(tibble)
library(tidyr)

geo_tox_data <- list()

Chemicals

External exposure

Download modeled exposure data from AirToxScreen. Results from 2020 for a subset of chemicals in North Carolina counties are included in the GeoTox package data as geo_tox_data$exposure.

Note

The 2020 version has census block level data, which is provided in several large files separated by region. The 2019 version has census tract level data, which is provided as a single file and is much smaller to download.

filename <- "Region4b_2020ATS_Exposure_Concentrations.xlsx"
tmp <- tempfile(filename)
download.file(
  paste0(
    "https://gaftp.epa.gov/rtrmodeling_public/AirToxScreen/2020/",
    "Exposure%20Concentrations/",
    filename
  ),
  tmp
)
exposure <- read_xlsx(tmp)

# Normalization function
min_max_norm = function(x) {
  min_x <- min(x, na.rm = TRUE)
  max_x <- max(x, na.rm = TRUE)
  if (min_x == max_x) {
    rep(0, length(x))
  } else {
    (x - min_x) / (max_x - min_x)
  }
}

geo_tox_data$exposure <- exposure |>
  # North Carolina counties
  filter(State == "NC", !grepl("0$", FIPS)) |>
  # Aggregate chemicals by county
  summarize(
    across(
      -c(State:Population), # Note: use "-c(State:Tract)" for 2019 data
      list(mean = mean, sd = sd),
      .names = "{col}|||{fn}"
    ),
    .by = FIPS
  ) |>
  pivot_longer(-FIPS) |>
  separate_wider_delim(name, "|||", names = c("chemical", "stat")) |>
  pivot_wider(names_from = stat) |>
  # Normalize concentrations
  mutate(norm = min_max_norm(mean), .by = chemical)

Get chemical identifiers from CompTox Dashboard

The exposure data is labeled with chemical names, but names can vary across resources and a more reliable way to connect datasets is by using Chemical Abstracts Service Registry Number (CAS-RN) identifiers. Use the batch search feature of the CompTox Dashboard to retrieve CAS-RN and PREFERRED_NAME for the chemicals in the exposure dataset.

# Copy/paste the chemical names into the batch search field
# https://comptox.epa.gov/dashboard/batch-search
cat(geo_tox_data$exposure |> distinct(chemical) |> pull(), sep = "\n")
# Export results with "CAS-RN" identifiers as a csv file, then process in R

# Remove rows without clear chemical matches, investigate manually if desired
exposure_casrn <- read_csv("CCD-Batch-Search.csv") |>
  filter(DTXSID != "N/A", !grepl("WARNING", FOUND_BY))

# Update exposure data with CompTox Dashboard data
geo_tox_data$exposure <- geo_tox_data$exposure |>
  inner_join(exposure_casrn, by = join_by(chemical == INPUT)) |>
  select(FIPS, casn = CASRN, chnm = PREFERRED_NAME, mean, sd, norm)

Active chemicals

Use the Integrated Chemical Environment (ICE) curated high-throughput screening (cHTS) data to identify active chemicals for a given set of assays.

Note

The following can be used to get a list of available assays for a given chemical set.

# Get all supported assays
help_text <- request("https://ice.ntp.niehs.nih.gov/api/v1/search/help") |>
  req_perform() |>
  resp_body_string()
supported_assays <- str_split_i(help_text, "Supported assay\\(s\\):", 2) |>
  str_split_1("\",\"") |>
  str_replace_all(c("\"" = "")) |>
  str_trim()

# Search for assays available for given chemids
chemids <- unique(geo_tox_data$exposure$casn)
resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/search") |>
  req_body_json(list(chemids = chemids, assays = supported_assays)) |>
  req_perform()
result <- resp |> resp_body_json() |> pluck("endPoints")
fields <- c("assay", "casrn", "dtxsid", "substanceName", "endpoint", "value")
df <- map(fields, \(x) map_chr(result, x)) |>
  set_names(fields) |>
  bind_cols()
available_assays <- df |> distinct(assay) |> pull() |> sort()

get_cHTS_hits <- function(assays = NULL, chemids = NULL) {
  if (is.null(assays) & is.null(chemids)) {
    stop("Must provide at least one of 'assays' or 'chemids'")
  }

  # Format query parameters
  req_params <- list()

  if (!is.null(assays)) {
    if (!is.list(assays)) assays <- as.list(assays)
    req_params$assays <- assays
  }

  if (!is.null(chemids)) {
    if (!is.list(chemids)) chemids <- as.list(chemids)
    req_params$chemids <- chemids
  }

  # Query ICE API
  resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/search") |>
    req_body_json(req_params) |>
    req_perform()

  if (resp$status_code != 200) {
    stop("Failed to retrieve data from ICE API")
  }

  # Return active chemicals
  result <- resp |> resp_body_json() |> pluck("endPoints")

  fields <- c("assay", "casrn", "dtxsid", "substanceName", "endpoint", "value")

  map(fields, \(x) map_chr(result, x)) |>
    set_names(fields) |>
    bind_cols() |>
    filter(endpoint == "Call", value == "Active") |>
    select(-c(endpoint, value)) |>
    distinct()
}

assays <- c(
  "APR_HepG2_p53Act_1hr",
  "APR_HepG2_p53Act_24hr",
  "APR_HepG2_p53Act_72hr",
  "ATG_p53_CIS",
  "TOX21_DT40_LUC",
  "TOX21_DT40_100_LUC",
  "TOX21_DT40_657_LUC",
  "TOX21_ELG1_LUC_Agonist",
  "TOX21_H2AX_HTRF_CHO_Agonist_ratio",
  "TOX21_p53_BLA_p1_ratio",
  "TOX21_p53_BLA_p2_ratio",
  "TOX21_p53_BLA_p3_ratio",
  "TOX21_p53_BLA_p4_ratio",
  "TOX21_p53_BLA_p5_ratio"
)

chemids <- unique(geo_tox_data$exposure$casn)

cHTS_hits <- get_cHTS_hits(assays = assays, chemids = chemids)

Dose-response

Use the ICE API to retrieve dose-response data for selected assays and chemicals, which is included in the GeoTox package data as geo_tox_data$dose_response.

get_ICE_dose_resp <- function(assays = NULL, chemids = NULL) {
  if (is.null(assays) & is.null(chemids)) {
    stop("Must provide at least one of 'assays' or 'chemids'")
  }

  # Format query parameters
  req_params <- list()

  if (!is.null(assays)) {
    if (!is.list(assays)) assays <- as.list(assays)
    req_params$assays <- assays
  }

  if (!is.null(chemids)) {
    if (!is.list(chemids)) chemids <- as.list(chemids)
    req_params$chemids <- chemids
  }

  # Query ICE API
  resp <- request("https://ice.ntp.niehs.nih.gov/api/v1/curves") |>
    req_body_json(req_params) |>
    req_perform()

  if (resp$status_code != 200) {
    stop("Failed to retrieve data from ICE API")
  }

  # Return dose-response data
  result <- resp |> resp_body_json() |> pluck("curves")

  map(result, function(x) {
    tibble(
      endp = x[["endpoint"]],
      casn = x[["casrn"]],
      chnm = x[["substance"]],
      call = x[["call"]],
      logc = map_dbl(x[["concentrationResponses"]], "concentration") |> log10(),
      resp = map_dbl(x[["concentrationResponses"]], "response")
    )
  }) |>
    bind_rows()
}

assays <- unique(cHTS_hits$assay)

chemids <- intersect(cHTS_hits$casrn, geo_tox_data$exposure$casn)

dose_response <- get_ICE_dose_resp(assays = assays, chemids = chemids)

# Only keep active calls for assay/chemical combinations
geo_tox_data$dose_response <- dose_response |>
  filter(call == "Active") |>
  select(-call)

# Update dose-response chemical names using CompTox Dashboard data
geo_tox_data$dose_response <- geo_tox_data$dose_response |>
  inner_join(exposure_casrn, by = join_by(casn == CASRN)) |>
  select(endp, casn, chnm = PREFERRED_NAME, logc, resp)

Population

Age

Download age data from the U.S. Census Bureau by searching for “County Population by Characteristics”. A subset of data for North Carolina from 2020 is included in the GeoTox package data as geo_tox_data$age.

# Data for North Carolina
url <- paste0(
  "https://www2.census.gov/programs-surveys/popest/datasets/",
  "2020-2024/counties/asrh/cc-est2024-alldata-37.csv"
)
age <- read_csv(url)

geo_tox_data$age <- age |>
  # 7/1/2020 population estimate
  filter(YEAR == 2) |>
  # Create FIPS
  mutate(FIPS = str_c(STATE, COUNTY)) |>
  # Keep selected columns
  select(FIPS, AGEGRP, TOT_POP)

Obesity status

Search CDC data for “places county data”. Go to the desired dataset webpage, e.g. 2020 county data, and download the data by selecting Actions → API → Download file. A subset of data for North Carolina is included in the GeoTox package data as geo_tox_data$obesity.

places <- read_csv(
  "PLACES__County_Data_(GIS_Friendly_Format),_2020_release.csv"
)

# Convert confidence interval to standard deviation
extract_SD <- function(x) {
  range <- as.numeric(str_split_1(str_sub(x, 2, -2), ","))
  diff(range) / 3.92
}

geo_tox_data$obesity <- places |>
  # North Carolina Counties
  filter(StateAbbr == "NC") |>
  # Select obesity data
  select(FIPS = CountyFIPS, OBESITY_CrudePrev, OBESITY_Crude95CI) |>
  # Change confidence interval to standard deviation
  rowwise() |>
  mutate(OBESITY_SD = extract_SD(OBESITY_Crude95CI)) |>
  ungroup() |>
  select(-OBESITY_Crude95CI)

Steady-state plasma concentration (C_ss)

Use the httk package to generate C_ss values for combinations of age group and weight status for each chemical. The generation of these values is a time-intensive step, so one approach is to generate populations of C_ss values initially and then sample them later. The simulated C_ss values are included in the GeoTox package data as geo_tox_data$simulated_css.

set.seed(2345)
n_samples <- 500

# Get CASN for which httk simulation is possible. Try using load_dawson2021,
# load_sipes2017, or load_pradeep2020 to increase availability.
load_sipes2017()
casn <- intersect(
  unique(geo_tox_data$dose_response$casn),
  get_cheminfo(suppress.messages = TRUE)
)

# Define population demographics for httk simulation
pop_demo <- cross_join(
  tibble(age_group = list(
    c(0, 2), c(3, 5), c(6, 10), c(11, 15), c(16, 20), c(21, 30), c(31, 40),
    c(41, 50), c(51, 60), c(61, 70), c(71, 100)
  )),
  tibble(weight = c("Normal", "Obese"))
)

# Create wrapper function around httk steps
simulate_css <- function(
  chem.cas,
  agelim_years,
  weight_category,
  samples,
  verbose = TRUE
) {
  if (verbose) {
    cat(
      chem.cas,
      paste0("(", paste(agelim_years, collapse = ", "), ")"),
      weight_category,
      "\n"
    )
  }

  httkpop <- list(
    method = "vi",
    gendernum = NULL,
    agelim_years = agelim_years,
    agelim_months = NULL,
    weight_category = weight_category,
    reths = c(
      "Mexican American",
      "Other Hispanic",
      "Non-Hispanic White",
      "Non-Hispanic Black",
      "Other"
    )
  )

  css <- try(
    suppressWarnings({
      mcs <- create_mc_samples(
        chem.cas = chem.cas,
        samples = samples,
        httkpop.generate.arg.list = httkpop,
        suppress.messages = TRUE
      )

      calc_analytic_css(
        chem.cas = chem.cas,
        parameters = mcs,
        model = "3compartmentss",
        suppress.messages = TRUE
      )
    }),
    silent = TRUE
  )

  # Return
  if (is(css, "try-error")) {
    warning("simulate_css() failed to generate data for CASN ", chem.cas)
    list(NA)
  } else {
    list(css)
  }
}

# Simulate C_ss values
simulated_css <- map(casn, function(chem.cas) {
  pop_demo |>
    rowwise() |>
    mutate(
      css = simulate_css(.env$chem.cas, age_group, weight, .env$n_samples)
    ) |>
    ungroup()
})
simulated_css <- setNames(simulated_css, casn)

# Remove CASN that failed simulate_css
casn_keep <- map_lgl(simulated_css, function(df) {
  !(length(df$css[[1]]) == 1 && is.na(df$css[[1]]))
})
simulated_css <- simulated_css[casn_keep]

geo_tox_data$simulated_css <- simulated_css |>
  map(\(x) {
    x |>
      mutate(
        age_lb = map_int(age_group, first),
        age_ub = map_int(age_group, last)
      ) |>
      select(age_lb, age_ub, weight, css) |>
      unnest(css)
  }) |>
  enframe(name = "casn") |>
  unnest(value)

Retain overlap

Retain only those chemicals found in exposure, dose-response and C_ss datasets.

casn_keep <- unique(geo_tox_data$simulated_css$casn)

geo_tox_data$exposure <- geo_tox_data$exposure |>
  filter(casn %in% casn_keep)

geo_tox_data$dose_response <- geo_tox_data$dose_response |>
  filter(casn %in% casn_keep)

County/State boundaries

Download cartographic boundary files for counties and states from the U.S. Census Bureau. The geometry data for North Carolina counties and the state are included in the GeoTox package data as geo_tox_data$boundaries.

county <- st_read("cb_2020_us_county_5m/cb_2020_us_county_5m.shp")
state <- st_read("cb_2020_us_state_5m/cb_2020_us_state_5m.shp")

geo_tox_data$boundaries <- list(
  county = county |>
    filter(STATEFP == 37) |>
    select(FIPS = GEOID, geometry),
  state = state |>
    filter(STATEFP == 37) |>
    select(geometry)
)