Code
library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)

This week we’re exploring books from Project Gutenberg and the {gutenbergr} R package!
[{gutenbergr} allows you to] Download and process public domain works in the Project Gutenberg collection https://www.gutenberg.org/. Includes metadata for all Project Gutenberg works, so that they can be searched and retrieved.
Thank you to Jon Harmon, Data Science Learning Community for curating this week’s dataset.
# Load the four TidyTuesday (2025-06-03) Project Gutenberg tables straight
# from the tidytuesday GitHub repository.
gutenberg_authors <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_authors.csv"
)
gutenberg_languages <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_languages.csv"
)
gutenberg_metadata <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_metadata.csv"
)
gutenberg_subjects <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_subjects.csv"
)

# Tally the "lcc" subject codes, most frequent first.
# count(name = "count", sort = TRUE) is the one-step equivalent of
# group_by() |> summarize(count = n()) |> arrange(desc(count)).
gutenberg_subjects |>
  filter(subject_type == "lcc") |>
  count(subject, name = "count", sort = TRUE)
# A tibble: 268 × 2
subject count
<chr> <int>
1 PS 12013
2 PR 10641
3 PZ 7860
4 PQ 5281
5 PT 3242
6 AP 2686
7 PH 1810
8 DA 1745
9 PN 1071
10 DS 1042
# ℹ 258 more rows
# Keep the works whose bookshelf label contains "child", ignoring case
# (the "(?i)" prefix switches the regex to case-insensitive matching).
gutenberg_metadata |>
  dplyr::filter(
    gutenberg_bookshelf |> stringr::str_detect("(?i)Child")
  )
# A tibble: 8,953 × 8
gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
<dbl> <chr> <chr> <dbl> <chr> <chr>
1 11 Alice's… Carro… 7 en Children's Literat…
2 11 Alice's… Carro… 7 en Children's Literat…
3 12 Through… Carro… 7 en Children's Literat…
4 13 The Hun… Carro… 7 en Children's Literat…
5 16 Peter P… Barri… 10 en Children's Literat…
6 19 The Son… Longf… 16 en Children's Literat…
7 41 The Leg… Irvin… 34 en Children's Literat…
8 45 Anne of… Montg… 36 en Canada/Children's …
9 46 A Chris… Dicke… 37 en Children's Literat…
10 47 Anne of… Montg… 36 en Canada/Children's …
# ℹ 8,943 more rows
# ℹ 2 more variables: rights <chr>, has_text <lgl>
# Per-author summary for authors with a recorded death year:
#   n_books   - number of distinct Project Gutenberg works by the author
#   lifespan  - deathdate - birthdate (NA when the birth year is missing)
#   deathdate - the author's death year
#   language  - language code of the author's first listed work
#   english   - TRUE if any of the author's works has a language code
#               containing "en"; NA if no language is recorded at all
gut_sum <- gutenberg_authors |>
  select(gutenberg_author_id, author, birthdate, deathdate) |>
  filter(!is.na(deathdate)) |>
  left_join(
    select(gutenberg_metadata, gutenberg_id, title, gutenberg_author_id, language),
    by = "gutenberg_author_id"
  ) |>
  mutate(lifespan = deathdate - birthdate) |>
  # Group on the author id as well as the name so that two different authors
  # who happen to share a name are not merged into a single point.
  group_by(gutenberg_author_id, author) |>
  summarize(
    # Count distinct work ids rather than rows: metadata can list a work more
    # than once, and an author without any matching work (all-NA row from the
    # left join) must count as 0, not 1.
    n_books = n_distinct(gutenberg_id, na.rm = TRUE),
    lifespan = min(lifespan),
    deathdate = min(deathdate),
    # `english` is computed before `language` is overwritten below, because
    # later summarize() expressions see the already-summarised columns.
    english = if (all(is.na(language))) {
      NA
    } else {
      any(str_detect(language, "en"), na.rm = TRUE)
    },
    language = first(language),
    .groups = "drop"
  ) |>
  # Same columns, in the same order, as before.
  select(author, n_books, lifespan, deathdate, language, english)
# Scatter plot of lifespan against number of works, one point per author.
# Colour encodes the death year and shape encodes the `english` flag; the
# `text` aesthetic is not drawn by ggplot2 itself and is carried along for
# the interactive plotly tooltip.
p <- ggplot(
  data = ungroup(gut_sum),
  mapping = aes(x = lifespan, y = n_books, colour = deathdate, text = author)
) +
  geom_point(mapping = aes(shape = english), alpha = 0.4) +
  labs(
    x = "lifespan",
    y = "",
    title = "number of books in Project Gutenberg",
    colour = "year of death",
    shape = "English"
  )
plotly::ggplotly(p)

For each author, their lifespan and number of books in Project Gutenberg are shown. The shape of each point indicates whether the author's work in the collection is (at least partly) in English.
praise()
[1] "You are first-class!"