Code
library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
This week we’re exploring text data from Amazon’s annual reports. The PDFs were read into R using the {pdftools} R package, and explored by TidyTuesday participant Gregory Vander Vinne in a post on his website. Note that stop words (e.g., “and”, “the”, “a”) have been removed from the data.
As a publicly-traded company, Amazon releases an annual report every year (with a December 31st year end). An annual report is essentially a summary of the company’s performance over the past year. It includes details on how well the company did financially, what goals were achieved, and what challenges it faced.
How have the words used changed over time?
Are there meaningful changes in sentiment from year to year?
Which words are likely to appear together in the same annual report?
Thank you to Gregory Vander Vinne for curating this week’s dataset.
# Word-level text of Amazon's annual reports (stop words already removed),
# published as the TidyTuesday dataset for 2025-03-25.
report_words_clean <- readr::read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2025/2025-03-25/report_words_clean.csv"
)

# Amazon net income read from a local CSV file.
# NOTE(review): later code joins on `year` and plots `net_income`, so the
# file is assumed to provide both columns -- confirm against the export.
amazon <- read_csv("statista_amazon-net-income.csv")
library(paletteer)
# First 15 colours of the Tableau 20 palette, used for the per-word lines in
# the word-frequency plot.
cb_palette <- paletteer_d("ggthemes::Tableau_20")[1:15]
# Ten most frequent words in each year's report, drawn as one line per word.
# slice_max() keeps ties, exactly as the superseded top_n() did, but names
# the ranking column explicitly instead of silently using the last column.
# NOTE(review): scale_color_manual() fails if the plot contains more distinct
# words than there are colours in cb_palette (15) -- confirm against the data.
report_words_clean |>
  count(year, word, name = "count") |>
  slice_max(count, n = 10, by = year) |>
  ggplot(aes(x = year, y = count, group = word, color = word)) +
  geom_line() +
  geom_point() +
  scale_color_manual(values = cb_palette)
library(tidytext)
# Sentiment lexicons from {tidytext}.
bing <- get_sentiments("bing")

# NRC tags words with emotions plus the broad "positive"/"negative" labels.
# Only the emotions are analysed below, so the two polarity labels are
# dropped here.
nrc <- get_sentiments("nrc") |>
  filter(!sentiment %in% c("positive", "negative"))

# AFINN scores each word with a number in `value`; it carries no
# "positive"/"negative" labels, so nothing needs to be filtered out.
afinn <- get_sentiments("afinn")
# Multiplier that puts the average AFINN score on the same axis as net
# income; the secondary axis divides by it again to show the original scale.
sentiment_scale <- 50000

# Average AFINN score per report year plotted against Amazon's net income.
# inner_join() keeps only the words that have an AFINN score; full_join()
# keeps years that appear in just one of the two tables.
report_words_clean |>
  inner_join(afinn, by = "word") |>
  summarize(ave_sentiment = mean(value), .by = year) |>
  full_join(amazon, by = "year") |>
  ggplot(aes(x = year)) +
  geom_line(
    aes(y = ave_sentiment * sentiment_scale, color = "Average sentiment")
  ) +
  geom_line(aes(y = net_income, color = "Amazon net income")) +
  scale_y_continuous(
    "Amazon net income",
    sec.axis = sec_axis(~ . / sentiment_scale, name = "Average sentiment")
  ) +
  labs(color = NULL)
# Fill/heading colour for each of the eight NRC emotions, keyed by the
# emotion name so it can be used directly as a named manual scale.
emotion_colors <- c(
  "anger" = "#E63946", # Red
  "anticipation" = "#F4A261", # Orange
  "disgust" = "#6A994E", # Olive green
  "fear" = "#5E548E", # Dark purple
  "joy" = "#F9C74F", # Yellow
  "sadness" = "#577590", # Blue
  "surprise" = "#F28482", # Pink
  "trust" = "#90BE6D" # Green
)
# Share of each NRC emotion among the emotion-tagged words of every report.
# A word can carry several emotions and occurs many times in the reports, so
# the join is declared many-to-many. Only emotions that have an entry in
# emotion_colors are kept, so every bar segment gets a defined fill.
report_words_clean |>
  inner_join(nrc, by = "word", relationship = "many-to-many") |>
  filter(sentiment %in% names(emotion_colors)) |>
  ggplot(aes(x = year, fill = sentiment)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = emotion_colors) +
  labs(y = "")
library(wordcloud2)
library(htmltools)
# Build a coloured heading plus a word cloud for one NRC emotion.
#
# emotion: emotion name as it appears in the lexicon's `sentiment` column.
# color:   colour used for both the heading and the cloud.
# words:   data frame with one row per word occurrence (needs a `word` column).
# lexicon: data frame with `word` and `sentiment` columns.
#
# Returns an htmltools tag list holding the heading and the widget.
emotion_wordcloud <- function(emotion, color,
                              words = report_words_clean,
                              lexicon = nrc) {
  # Restricting the lexicon to one emotion first keeps the join to the words
  # of interest instead of joining everything and filtering afterwards.
  emotion_words <- filter(lexicon, sentiment == emotion)

  tagList(
    tags$h3(
      emotion,
      style = paste0("color: ", color, "; text-align: center;")
    ),
    words |>
      inner_join(emotion_words, by = "word") |>
      count(word, name = "freq") |>
      wordcloud2(color = color)
  )
}

# One heading + word cloud per emotion, in the order of emotion_colors.
tagList(
  lapply(
    names(emotion_colors),
    \(emotion) emotion_wordcloud(emotion, emotion_colors[[emotion]])
  )
)