Text Data from Amazon’s Annual Reports

Author

Jo Hardin

Published

March 25, 2025

Code

library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)

The Data

This week we’re exploring text data from Amazon’s annual reports. The PDFs were read into R using the {pdftools} R package, and explored by TidyTuesday participant Gregory Vander Vinne in a post on his website. Note that stop words (e.g., “and”, “the”, “a”) have been removed from the data.

As a publicly-traded company, Amazon releases an annual report every year (with a December 31st year end). An annual report is essentially a summary of the company’s performance over the past year. It includes details on how well the company did financially, what goals were achieved, and what challenges it faced.

How have the words used changed over time?
Are there meaningful changes in sentiment from year to year?
Which words are likely to appear together in the same annual report?

Thank you to Gregory Vander Vinne for curating this week’s dataset.

Code

# Cleaned word data from Amazon's annual reports, read directly from the
# TidyTuesday repository (2025-03-25 release).
report_words_clean <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-03-25/report_words_clean.csv"
)

# Net income figures, read from a CSV expected in the working directory.
# Later code relies on its `year` and `net_income` columns.
amazon <- readr::read_csv("statista_amazon-net-income.csv")

Most common words

Code

library(paletteer)

# First fifteen colours of the Tableau 20 palette, used to tell words apart.
cb_palette <- paletteer_d("ggthemes::Tableau_20")[1:15]

# For every year, keep the ten most frequent words (ties kept) and draw one
# line per word showing its count in the years where it makes the top ten.
report_words_clean |>
  count(year, word, name = "count") |>
  group_by(year) |>
  # slice_max() replaces the superseded top_n() and names the ranking column
  # explicitly instead of relying on "the last column" being picked.
  slice_max(count, n = 10) |>
  ungroup() |>
  ggplot(aes(x = year, y = count, group = word, color = word)) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = cb_palette)

Code

library(tidytext)

# Sentiment lexicons used in the analyses below.
bing <- get_sentiments("bing")

# NRC labels words with emotions and also with the broad "positive" /
# "negative" polarity. Only the emotions are analysed here, so the two
# polarity labels are dropped.
nrc <- get_sentiments("nrc") |>
  filter(!sentiment %in% c("positive", "negative"))

# AFINN stores a numeric score in `value`; filtering it against the strings
# "positive" / "negative" never removed a row, so no filter is applied.
afinn <- get_sentiments("afinn")

Code

# Average AFINN sentiment per year drawn together with Amazon's net income.
# Sentiment is multiplied by `sentiment_scale` so that both series share the
# primary (net income) axis; the secondary axis divides by the same factor to
# show the sentiment values in their own units.
sentiment_scale <- 50000

report_words_clean |>
  # Keep only the words that have an AFINN score.
  inner_join(afinn, by = "word") |>
  group_by(year) |>
  summarize(ave_sentiment = mean(value)) |>
  # full_join() keeps years that appear in only one of the two tables.
  full_join(amazon, by = "year") |>
  ggplot(aes(x = year)) +
  # Constant colour labels create a legend so the two lines can be told apart.
  geom_line(
    aes(y = ave_sentiment * sentiment_scale, color = "Average sentiment")
  ) +
  geom_line(aes(y = net_income, color = "Amazon net income")) +
  scale_y_continuous(
    "Amazon net income",
    sec.axis = sec_axis(~ . / sentiment_scale, name = "Average sentiment")
  ) +
  labs(color = NULL)

Code

# Fill colour for each emotion category, keyed by the emotion's name.
emotion_colors <- c(
  anger        = "#E63946", # red
  anticipation = "#F4A261", # orange
  disgust      = "#6A994E", # olive green
  fear         = "#5E548E", # dark purple
  joy          = "#F9C74F", # yellow
  sadness      = "#577590", # blue
  surprise     = "#F28482", # pink
  trust        = "#90BE6D"  # green
)



# One bar per year, each normalised to 1, showing how the emotion-tagged
# words of that year's report split across the emotion categories.
report_words_clean |>
  # A word occurs many times in the reports and can carry several emotions,
  # so a many-to-many match is expected here; declaring it avoids the warning.
  inner_join(nrc, by = "word", relationship = "many-to-many") |>
  # Keep only the categories that have a colour in `emotion_colors`; other
  # lexicon labels (e.g. "positive" / "negative") would otherwise be drawn as
  # unlabelled grey segments and distort the proportions.
  filter(sentiment %in% names(emotion_colors)) |>
  ggplot(aes(x = year, fill = sentiment)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = emotion_colors) +
  labs(y = "")

Bar chart with a separate bar for each year. Each bar is filled according to the proportion of words in each sentiment which are used in the Amazon annual reports. There have not been substantial changes in the sentiment of the reports over time. — Proportion of words in each sentiment category for Amazon annual reports across time. No substantial temporal trends are seen.

Code

library(wordcloud2)
library(htmltools)

# Heading plus a word cloud of the report words tagged "anger", sized by how
# often each word occurs across all reports.
tagList(
  tags$h3("anger", style = "color: #E63946; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "anger"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#E63946")
)

anger

Code

# Heading plus a word cloud of the report words tagged "anticipation", sized
# by how often each word occurs across all reports.
tagList(
  tags$h3("anticipation", style = "color: #F4A261; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "anticipation"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#F4A261")
)

anticipation

Code

# Heading plus a word cloud of the report words tagged "disgust", sized by
# how often each word occurs across all reports.
tagList(
  tags$h3("disgust", style = "color: #6A994E; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "disgust"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#6A994E")
)

disgust

Code

# Heading plus a word cloud of the report words tagged "fear", sized by how
# often each word occurs across all reports.
tagList(
  tags$h3("fear", style = "color: #5E548E; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "fear"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#5E548E")
)

fear

Code

# Heading plus a word cloud of the report words tagged "joy", sized by how
# often each word occurs across all reports.
tagList(
  tags$h3("joy", style = "color: #F9C74F; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "joy"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#F9C74F")
)

joy

Code

# Heading plus a word cloud of the report words tagged "sadness", sized by
# how often each word occurs across all reports.
tagList(
  tags$h3("sadness", style = "color: #577590; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "sadness"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#577590")
)

sadness

Code

# Heading plus a word cloud of the report words tagged "surprise", sized by
# how often each word occurs across all reports.
tagList(
  tags$h3("surprise", style = "color: #F28482; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "surprise"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#F28482")
)

surprise

Code

# Heading plus a word cloud of the report words tagged "trust", sized by how
# often each word occurs across all reports.
tagList(
  tags$h3("trust", style = "color: #90BE6D; text-align: center;"),
  report_words_clean |>
    # Restricting the lexicon to one emotion before joining avoids the
    # many-to-many match against the full lexicon (assuming each word-emotion
    # pair is listed once).
    inner_join(filter(nrc, sentiment == "trust"), by = "word") |>
    count(word, name = "freq") |>
    wordcloud2(data = _, color = "#90BE6D")
)