Code
library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
library(gganimate)
library(gifski)
library(magick)library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
library(gganimate)
library(gifski)
library(magick)This week we are exploring lead levels in water samples collected in Flint, Michigan in 2015. The data comes from a paper by Loux and Gibson (2018) who advocate for using this data as a teaching example in introductory statistics courses.
The Flint lead data provide a compelling example for introducing students to simple univariate descriptive statistics. In addition, they provide examples for discussion of sampling and data collection, as well as ethical data handling.
The data this week includes samples collected by the Michigan Department of Environmental Quality (MDEQ) and data from a citizen science project coordinated by Prof Marc Edwards and colleagues at Virginia Tech. Community-sourced samples were collected after concerns were raised about the MDEQ excluding samples from their data. You can read about the “murky” story behind this data here.
flint_mdeq <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-04/flint_mdeq.csv')
flint_vt <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-04/flint_vt.csv')n_drop <- 10
n_stay <- 5
df1 <- flint_mdeq |>
mutate(dot_id = row_number()) |>
select(dot_id, lead)
total_frames <- max(df1$dot_id) + n_drop + n_stay - 1
df2 <- expand_grid(
dot_id = df1$dot_id,
frame = 1:total_frames) |> # cross join: each dot × frame
left_join(df1, by = "dot_id") |>
left_join(df1 |> mutate(start_frame = dot_id), by = "dot_id") |>
mutate(y = case_when(
frame <= start_frame ~ 10,
frame <= start_frame + n_drop - 1 ~ 10 - (frame - start_frame) * (10 / (n_drop - 1)),
frame <= start_frame + n_drop + n_stay - 1 ~ 0,
TRUE ~ 0)) |>
mutate(dot_alpha = case_when(
y > 9.5 ~ 0,
y == 0 ~ 0.6,
TRUE ~ 0.3
))ref_lines <- tibble(
x = c(15, quantile(flint_mdeq$lead, 0.9)),
xend = c(15, quantile(flint_mdeq$lead, 0.9)),
y = c(-0.5, -0.5),
yend = c(10, 10),
lty = c(1, 2)
)
labels <- tibble(
label = c("15 ppb", "18 ppb"),
x = c(15, 18),
y = c(-1, -2)
)p <- ggplot(df2, aes(x = lead.x, y = y)) +
geom_point(color = "steelblue",
size = 7,
alpha = df2$dot_alpha) +
geom_segment(data = ref_lines,
aes(x = x, xend = xend, y = y,
yend = yend),
linetype = c(1,2),
inherit.aes = FALSE,
color = "black") +
geom_text(data = labels,
aes(x = x, y = y, label = label),
inherit.aes = FALSE) +
geom_hline(yintercept = 0) +
ylim(-3, 12) +
xlim(0, 107) +
theme_minimal() +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(size = 12), margin = margin(b = 10)) +
transition_manual(frame) +
labs(title = "Lead measurements in 71 water samples",
x = "amount of lead in parts per billion",
y = "")# run this R chunk before rendering
# don't evaluate on render, run through the frames in the scrollytelling
dir.create("frames", showWarnings = FALSE)
animate(p, fps = 20, width = 800, height = 300,
renderer = file_renderer("frames", prefix = "frame_"))n_drop <- 10
n_stay <- 5
df1.2 <- flint_mdeq |>
filter(!is.na(lead2)) |>
mutate(dot_id = row_number()) |>
select(dot_id, lead2)
total_frames.2 <- max(df1.2$dot_id) + n_drop + n_stay - 1
df2.2 <- expand_grid(
dot_id = df1.2$dot_id,
frame = 1:total_frames.2) |> # cross join: each dot × frame
left_join(df1.2, by = "dot_id") |>
left_join(df1.2 |> mutate(start_frame = dot_id), by = "dot_id") |>
mutate(y = case_when(
frame <= start_frame ~ 10,
frame <= start_frame + n_drop - 1 ~ 10 - (frame - start_frame) * (10 / (n_drop - 1)),
frame <= start_frame + n_drop + n_stay - 1 ~ 0,
TRUE ~ 0)) |>
mutate(dot_alpha = case_when(
y > 9.5 ~ 0,
y == 0 ~ 0.6,
TRUE ~ 0.3
))ref_lines2 <- tibble(
x = c(15, quantile(flint_mdeq$lead2, 0.9, na.rm = TRUE)),
xend = c(15, quantile(flint_mdeq$lead2, 0.9, na.rm = TRUE)),
y = c(-0.5, -0.5),
yend = c(10, 10),
lty = c(1, 2)
)
labels2 <- tibble(
label = c("15 ppb", "11.4 ppb"),
x = c(15, 11.4),
y = c(-1, -2)
)p2 <- ggplot(df2.2, aes(x = lead2.x, y = y)) +
geom_point(color = "steelblue",
size = 7,
alpha = df2.2$dot_alpha) +
geom_segment(data = ref_lines2,
aes(x = x, xend = xend, y = y,
yend = yend),
linetype = c(1,2),
inherit.aes = FALSE,
color = "black") +
geom_text(data = labels2,
aes(x = x, y = y, label = label),
inherit.aes = FALSE) +
geom_hline(yintercept = 0) +
ylim(-3, 12) +
xlim(0, 107) +
theme_minimal() +
theme(
axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
panel.grid = element_blank(),
axis.text.x = element_text(size = 12), margin = margin(b = 10)) +
transition_manual(frame) +
labs(title = "Lead measurements in 69 (altered) water samples",
x = "amount of lead in parts per billion",
y = "")# run this R chunk before rendering
# don't evaluate on render, run through the frames in the scrollytelling
dir.create("frames2", showWarnings = FALSE)
animate(p2, fps = 20, width = 800, height = 300,
renderer = file_renderer("frames2", prefix = "frame_"))praise()[1] "You are marvelous!"