Lead concentration in Flint water samples in 2015

Author

Jo Hardin

Published

November 4, 2025

Code

library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
library(gganimate)
library(gifski)
library(magick)

The Data

This week we are exploring lead levels in water samples collected in Flint, Michigan in 2015. The data comes from a paper by Loux and Gibson (2018) who advocate for using this data as a teaching example in introductory statistics courses.

The Flint lead data provide a compelling example for introducing students to simple univariate descriptive statistics. In addition, they provide examples for discussion of sampling and data collection, as well as ethical data handling.

The data this week includes samples collected by the Michigan Department of Environmental Quality (MDEQ) and data from a citizen science project coordinated by Prof Marc Edwards and colleagues at Virginia Tech. Community-sourced samples were collected after concerns were raised about the MDEQ excluding samples from their data. You can read about the “murky” story behind this data here.

Code

flint_mdeq <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-04/flint_mdeq.csv')
flint_vt <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-04/flint_vt.csv')

The raw data

Code

n_drop <- 10
n_stay <- 5

df1 <- flint_mdeq |>
  mutate(dot_id = row_number()) |> 
  select(dot_id, lead)

total_frames <- max(df1$dot_id) + n_drop + n_stay - 1
  
df2 <- expand_grid(
    dot_id = df1$dot_id, 
    frame = 1:total_frames) |>  # cross join: each dot × frame
  left_join(df1, by = "dot_id") |>
  left_join(df1 |> mutate(start_frame = dot_id), by = "dot_id") |> 
  mutate(y = case_when(
    frame <= start_frame ~ 10,
    frame <= start_frame + n_drop - 1 ~ 10 - (frame - start_frame) * (10 / (n_drop - 1)),
    frame <= start_frame + n_drop + n_stay - 1 ~ 0,
    TRUE ~ 0)) |> 
  mutate(dot_alpha = case_when(
  y > 9.5 ~ 0,
  y == 0  ~ 0.6,
  TRUE ~ 0.3
))

Code

ref_lines <- tibble(
  x = c(15, quantile(flint_mdeq$lead, 0.9)),
  xend = c(15, quantile(flint_mdeq$lead, 0.9)),
  y = c(-0.5, -0.5),
  yend = c(10, 10),
  lty = c(1, 2)
)


labels <- tibble(
  label = c("15 ppb", "18 ppb"),
  x = c(15, 18),
  y = c(-1, -2)
)

Code

p <- ggplot(df2, aes(x = lead.x, y = y)) +
  geom_point(color = "steelblue", 
             size = 7,
             alpha = df2$dot_alpha) +
  geom_segment(data = ref_lines,
               aes(x = x, xend = xend, y = y, 
                   yend = yend), 
               linetype = c(1,2),
               inherit.aes = FALSE,
               color = "black")  +
  geom_text(data = labels,
            aes(x = x, y = y, label = label),
            inherit.aes = FALSE) +
  geom_hline(yintercept = 0) +
  ylim(-3, 12) + 
  xlim(0, 107) + 
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid = element_blank(),
    axis.text.x = element_text(size = 12), margin = margin(b = 10)) +
  transition_manual(frame) +
  labs(title = "Lead measurements in 71 water samples",
       x = "amount of lead in parts per billion",
       y = "")

Code

# run this R chunk before rendering
# don't evaluate on render, run through the frames in the scrollytelling

dir.create("frames", showWarnings = FALSE)

animate(p, fps = 20, width = 800, height = 300,
        renderer = file_renderer("frames", prefix = "frame_"))

Samples were collected by the Michigan Department of Environment (MDEQ) because of reports of high lead levels in the water.

If the 90th percentile of the samples was above 15 ppb (solid line), action would be required to be taken.

There is clear evidence in the raw data that the 90th percentile is above the cutoff and should activate further inquiry.

Two samples removed

Code

n_drop <- 10
n_stay <- 5

df1.2 <- flint_mdeq |>
  filter(!is.na(lead2)) |> 
  mutate(dot_id = row_number()) |> 
  select(dot_id, lead2)

total_frames.2 <- max(df1.2$dot_id) + n_drop + n_stay - 1
  
df2.2 <- expand_grid(
    dot_id = df1.2$dot_id, 
    frame = 1:total_frames.2) |>  # cross join: each dot × frame
  left_join(df1.2, by = "dot_id") |>
  left_join(df1.2 |> mutate(start_frame = dot_id), by = "dot_id") |> 
  mutate(y = case_when(
    frame <= start_frame ~ 10,
    frame <= start_frame + n_drop - 1 ~ 10 - (frame - start_frame) * (10 / (n_drop - 1)),
    frame <= start_frame + n_drop + n_stay - 1 ~ 0,
    TRUE ~ 0)) |> 
  mutate(dot_alpha = case_when(
  y > 9.5 ~ 0,
  y == 0  ~ 0.6,
  TRUE ~ 0.3
))

Code

ref_lines2 <- tibble(
  x = c(15, quantile(flint_mdeq$lead2, 0.9, na.rm = TRUE)),
  xend = c(15, quantile(flint_mdeq$lead2, 0.9, na.rm = TRUE)),
  y = c(-0.5, -0.5),
  yend = c(10, 10),
  lty = c(1, 2)
)


labels2 <- tibble(
  label = c("15 ppb", "11.4 ppb"),
  x = c(15, 11.4),
  y = c(-1, -2)
)

Code

p2 <- ggplot(df2.2, aes(x = lead2.x, y = y)) +
  geom_point(color = "steelblue", 
             size = 7,
             alpha = df2.2$dot_alpha) +
  geom_segment(data = ref_lines2,
               aes(x = x, xend = xend, y = y, 
                   yend = yend), 
               linetype = c(1,2),
               inherit.aes = FALSE,
               color = "black")  +
  geom_text(data = labels2,
            aes(x = x, y = y, label = label),
            inherit.aes = FALSE) +
  geom_hline(yintercept = 0) +
  ylim(-3, 12) + 
  xlim(0, 107) + 
  theme_minimal() +
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid = element_blank(),
    axis.text.x = element_text(size = 12), margin = margin(b = 10)) +
  transition_manual(frame) +
  labs(title = "Lead measurements in 69 (altered) water samples",
       x = "amount of lead in parts per billion",
       y = "")

Code

# run this R chunk before rendering
# don't evaluate on render, run through the frames in the scrollytelling

dir.create("frames2", showWarnings = FALSE)

animate(p2, fps = 20, width = 800, height = 300,
        renderer = file_renderer("frames2", prefix = "frame_"))

In their report, MDEQ removed samples with 104 ppb and 20 ppb.

The 90th percentile in the report dropped from 18 ppb in the raw data to 11.4 ppb in the altered dataset.

The final report (using the altered data) did not suggest further inquiry into the lead in the water in Flint, Michigan.

Code

praise()

[1] "You are marvelous!"