Code
library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
This week we’re exploring books from Project Gutenberg and the {gutenbergr} R package!
[{gutenbergr} allows you to] Download and process public domain works in the Project Gutenberg collection https://www.gutenberg.org/. Includes metadata for all Project Gutenberg works, so that they can be searched and retrieved.
Thank you to Jon Harmon, Data Science Learning Community for curating this week’s dataset.
# Load the four Project Gutenberg tables published for the TidyTuesday
# 2025-06-03 release. Each object is named after the CSV file it is read from.
gutenberg_authors <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_authors.csv'
)
gutenberg_languages <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_languages.csv'
)
gutenberg_metadata <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_metadata.csv'
)
gutenberg_subjects <- readr::read_csv(
  'https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-06-03/gutenberg_subjects.csv'
)
# Count how often each subject of type "lcc" occurs and list the subjects
# from most to least frequent.
# NOTE(review): "lcc" presumably means Library of Congress Classification;
# confirm against the dataset's data dictionary.
gutenberg_subjects |>
  filter(subject_type == "lcc") |>
  group_by(subject) |>
  summarize(count = n()) |>
  arrange(desc(count))
# A tibble: 268 × 2
subject count
<chr> <int>
1 PS 12013
2 PR 10641
3 PZ 7860
4 PQ 5281
5 PT 3242
6 AP 2686
7 PH 1810
8 DA 1745
9 PN 1071
10 DS 1042
# ℹ 258 more rows
# Keep the metadata rows whose bookshelf label contains "child"; the inline
# (?i) flag makes the regular expression case-insensitive.
gutenberg_metadata |>
  filter(stringr::str_detect(gutenberg_bookshelf, "(?i)Child"))
# A tibble: 8,953 × 8
gutenberg_id title author gutenberg_author_id language gutenberg_bookshelf
<dbl> <chr> <chr> <dbl> <chr> <chr>
1 11 Alice's… Carro… 7 en Children's Literat…
2 11 Alice's… Carro… 7 en Children's Literat…
3 12 Through… Carro… 7 en Children's Literat…
4 13 The Hun… Carro… 7 en Children's Literat…
5 16 Peter P… Barri… 10 en Children's Literat…
6 19 The Son… Longf… 16 en Children's Literat…
7 41 The Leg… Irvin… 34 en Children's Literat…
8 45 Anne of… Montg… 36 en Canada/Children's …
9 46 A Chris… Dicke… 37 en Children's Literat…
10 47 Anne of… Montg… 36 en Canada/Children's …
# ℹ 8,943 more rows
# ℹ 2 more variables: rights <chr>, has_text <lgl>
# Build a one-row-per-author summary: number of books, lifespan, death date,
# language of the first joined book, and an English flag.
gut_sum <- gutenberg_authors |>
  select(gutenberg_author_id, author, birthdate, deathdate) |>
  # Only authors with a recorded death date are kept.
  filter(!is.na(deathdate)) |>
  # Attach each author's books. left_join() keeps authors that have no
  # matching metadata row, so they still contribute one row to n_books below.
  # NOTE(review): confirm whether such authors should count as having 1 book.
  left_join(
    select(gutenberg_metadata, gutenberg_id, title, gutenberg_author_id, language),
    by = "gutenberg_author_id"
  ) |>
  mutate(lifespan = deathdate - birthdate) |>
  # Drop exact duplicate rows so a repeated metadata row is counted once.
  distinct() |>
  # Grouping is by author name, not by gutenberg_author_id.
  group_by(author) |>
  summarize(
    n_books = n(),
    lifespan = min(lifespan),
    deathdate = min(deathdate),
    language = first(language)
  ) |>
  # TRUE when the language string contains "en" anywhere.
  mutate(english = str_detect(language, "en"))
# Scatter plot of lifespan (x) against number of books (y), one point per
# author, colored by death date and shaped by the English flag.
p <- gut_sum |>
  ungroup() |>
  # `text = author` is carried as an extra aesthetic; presumably it supplies
  # the author name to the interactive tooltip when the plot is converted.
  ggplot(aes(x = lifespan, y = n_books, color = deathdate, text = author)) +
  geom_point(aes(shape = english), alpha = 0.4) +
  labs(
    x = "lifespan", y = "",
    title = "number of books in Project Gutenberg",
    color = "year of death",
    shape = "English"
  )
# Convert the static ggplot object into an interactive plotly widget.
plotly::ggplotly(p)
For each author, the plot shows their lifespan and the number of their books in Project Gutenberg. The shape of each point indicates whether the author's work is (at least partly) in English.
# Print a randomly generated compliment to close out the analysis.
praise::praise()
[1] "You are first-class!"