library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
library(tidytext)
library(rvest)

The Complete Sherlock Holmes
The Data
This week we’re exploring the complete line-by-line text of the Sherlock Holmes stories and novels, made available through the {sherlock} R package by Emil Hvitfeldt. The dataset includes the full collection of Holmes texts, organized by book and line number, and is ideal for stylometry, sentiment analysis, and literary exploration.
“The name is Sherlock Holmes and the address is 221B Baker Street.” Holmes is a consulting detective known for his keen observation, logical reasoning, and use of forensic science to solve complex cases. Created by Sir Arthur Conan Doyle, Holmes has become one of the most famous fictional detectives in literature.
- Are there patterns in how Watson narrates versus how Holmes speaks?
- How does sentence length vary between stories?
- Can we detect shifts in tone when Watson is the narrator versus when Holmes speaks directly?
- Does sentiment shift as the mystery unfolds?
Thank you to Darakhshan Nehal for curating this week’s dataset.
# Read the TidyTuesday Holmes CSV and normalise `book` (lower case, squished
# whitespace) so that it can later be matched against the scraped titles.
holmes <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-11-18/holmes.csv"
) |>
  mutate(book = str_squish(tolower(book)))

First we find the dates of each of the Sherlock Holmes texts from Wikipedia. Using regular expressions, we are able to pull out the dates.
# Scrape the Wikipedia "Canon of Sherlock Holmes" page into one row per list
# entry, with columns `value` (raw entry text), `date` (first four-digit run
# in the entry, kept as character) and `title` (normalised for joining).
holmes_wiki <- read_html("https://en.wikipedia.org/wiki/Canon_of_Sherlock_Holmes") |>
  # <li> and <a> nodes inside an ordered list that directly follows a paragraph
  html_elements("p+ ol li , p+ ol a") |>
  html_text() |>
  as_tibble() |>
  mutate(date = str_extract(value, "\\d{4}")) |>
  # entries without a four-digit run carry no date and are dropped
  filter(!is.na(date)) |>
  mutate(
    # leading text up to the first '"' or '(' (optionally wrapped in quotes),
    # then quotes removed, lower-cased and whitespace squished
    title = value |>
      str_extract('^"?([^"(]+)"?') |>
      str_replace_all('"', '') |>
      tolower() |>
      str_squish()
  )
# Attach the scraped Wikipedia columns to every line of text by matching the
# normalised `book` name against the normalised Wikipedia `title`. Books with
# no matching title keep their rows and get NA in the joined columns.
holmes <- left_join(holmes, holmes_wiki, by = join_by(book == title))

Next we use the NRC lexicon to identify the sentiment (or sentiments) associated with each word in the text. (The AFINN lexicon is loaded as well, but the analysis shown here does not use it.)
# AFINN lexicon, fetched through tidytext. It is not referenced again in the
# visible part of this file.
afinn_sentiments <- tidytext::get_sentiments("afinn")

# NRC lexicon with the "positive" and "negative" rows removed, so that only
# the remaining sentiment labels are used downstream.
nrc_sentiments <- filter(
  tidytext::get_sentiments("nrc"),
  !(sentiment == "positive" | sentiment == "negative")
)
# Per-year sentiment profile: one row per (date, sentiment) with the number of
# matching words `n` and `prop`, the share of that sentiment among all
# sentiment-bearing words of the same year.
sent_data <- holmes |>
  # books that found no match in the Wikipedia list have no date; drop them
  # here (as the listing table does) instead of letting ggplot discard them
  # later with a warning
  filter(!is.na(date)) |>
  tidytext::unnest_tokens(words, text) |>
  anti_join(stop_words, by = join_by(words == word)) |>
  # a word may carry several NRC sentiments and occurs on many lines, hence
  # many-to-many; the inner join keeps only words present in the lexicon
  inner_join(
    nrc_sentiments,
    by = join_by(words == word),
    relationship = "many-to-many"
  ) |>
  count(date, sentiment) |>
  # proportions are computed within each year; `.by` leaves the result
  # ungrouped
  mutate(prop = n / sum(n), .by = date) |>
  mutate(date = as.numeric(date))

# Line chart of each sentiment's yearly proportion, with vertical markers at
# 1893 and 1903 and a text label next to each marker.
sent_data |>
  ggplot(aes(x = date, y = prop, color = sentiment)) +
  geom_line() +
  geom_point() +
  scale_color_brewer(palette = "Dark2") +
  geom_vline(xintercept = c(1893, 1903)) +
  # annotate() draws each label exactly once; geom_text() with a constant
  # label would redraw it for every row of `sent_data`
  annotate("text", x = 1894, y = 0.22, label = "Holmes dies") +
  annotate("text", x = 1904, y = 0.22, label = "Holmes resurrected") +
  labs(
    title = "proportion of each sentiment",
    y = "", x = ""
  )

It was too hard to put all the titles on the graph, so we list them here, by date.
# One row per dated book, ordered by date, rendered as a gt table.
holmes |>
  filter(!is.na(date)) |>
  distinct(date, book) |>
  arrange(date) |>
  gt::gt()

| date | book |
|---|---|
| 1887 | a study in scarlet |
| 1890 | the sign of the four |
| 1891 | a scandal in bohemia |
| 1891 | the red-headed league |
| 1891 | a case of identity |
| 1891 | the boscombe valley mystery |
| 1891 | the five orange pips |
| 1891 | the man with the twisted lip |
| 1892 | the adventure of the blue carbuncle |
| 1892 | the adventure of the speckled band |
| 1892 | the adventure of the engineer's thumb |
| 1892 | the adventure of the noble bachelor |
| 1892 | the adventure of the beryl coronet |
| 1892 | the adventure of the copper beeches |
| 1893 | the adventure of the cardboard box |
| 1901 | the hound of the baskervilles |
| 1903 | the adventure of the empty house |
| 1903 | the adventure of the norwood builder |
| 1903 | the adventure of the dancing men |
| 1904 | the adventure of the solitary cyclist |
| 1904 | the adventure of the priory school |
| 1904 | the adventure of black peter |
| 1904 | the adventure of charles augustus milverton |
| 1904 | the adventure of the six napoleons |
| 1904 | the adventure of the three students |
| 1904 | the adventure of the golden pince-nez |
| 1904 | the adventure of the missing three-quarter |
| 1904 | the adventure of the abbey grange |
| 1904 | the adventure of the second stain |
| 1908 | the adventure of wisteria lodge |
| 1908 | the adventure of the bruce-partington plans |
| 1910 | the adventure of the devil's foot |
| 1911 | the adventure of the red circle |
| 1911 | the disappearance of lady frances carfax |
| 1913 | the adventure of the dying detective |
| 1914 | the valley of fear |
| 1917 | his last bow |
praise()

[1] "You are astonishing!"