library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(tidytext)
library(praise)
library(sf)
library(paletteer)
library(topicmodels)
Snopes - horror articles
The data
The data explore horror legends from Snopes.com.
data(stop_words)
<- read_csv("horror_articles.csv")
snopes
<- snopes |>
snopes_text select(claim, rating) |>
cbind(article = 1:253) |>
mutate(rating = case_when(
== "partly true" ~ "true",
rating == "mostly false" ~ "false",
rating == "legend" ~ "false",
rating == "probably false" ~ "false",
rating == "was true; now outdated" ~ "true",
rating == "mixture" ~ "true",
rating TRUE ~ rating
|>
)) rename(text = claim)
|>
snopes select(rating) |>
table()
rating
false legend miscaptioned
113 61 1
mixture mostly false partly true
10 5 2
probably false true undetermined
1 50 6
unproven was true; now outdated
3 1
|>
snopes_text select(rating) |>
table()
rating
false miscaptioned true undetermined unproven
180 1 63 6 3
<- snopes_text |>
snopes_words unnest_tokens(word, text, drop = FALSE) |>
group_by(article, word, rating) |>
summarize(count = n()) |>
ungroup() |>
anti_join(stop_words)
LDA
The goal is to figure out if the topics created through an LDA model will be similar for those articles which are validated to be true compared with those which have been disproven. As seen in the graphs, there are not obvious trends differentiating the two types of articles. Additionally, the topics within an article also seem quite similar, all eight of them focusing on death and killing.
#| fig.alt: Two sets of bar plots showing the value of beta from an LDA model. The first set is for Snopes claims that have been validated as true. Four different groups of words are given to represent four different topics. The second set is for Snopes claims that have been shown to be false. Four different groups of words are given to represent four different topics. All eight topics center around death and killing, and there are not obvious distinctions across the topic.
#| fig.cap: For each type of claim (True vs False), four sets of words (four topics) are given. All eight topics center around death and killing, and there are not obvious distinctions across the topic. Data credit: snopes.com
## TRUE reports
<- snopes_words |>
snopes_dtm_T filter(rating == "true") |>
cast_dtm(article, word, count)
<- LDA(snopes_dtm_T, k = 4, control = list(seed = 47))
snopes_lda_T <- tidy(snopes_lda_T, matrix = "beta")
snopes_topics_T
<- snopes_topics_T %>%
snopes_top_terms_T group_by(topic) %>%
slice_max(beta, n = 5) %>%
ungroup() %>%
arrange(topic, -beta)
## FALSE reports
<- snopes_words |>
snopes_dtm_F filter(rating == "false") |>
cast_dtm(article, word, count)
<- LDA(snopes_dtm_F, k = 4, control = list(seed = 47))
snopes_lda_F <- tidy(snopes_lda_F, matrix = "beta")
snopes_topics_F
<- snopes_topics_F %>%
snopes_top_terms_F group_by(topic) %>%
slice_max(beta, n = 5) %>%
ungroup() %>%
arrange(topic, -beta)
## Plotting
%>%
snopes_top_terms_T mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered() +
ggtitle("True Snopes Claims")
%>%
snopes_top_terms_F mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered() +
ggtitle("False Snopes Claims")
Gamma
<- tidy(snopes_lda_T, matrix = "gamma")
snopes_gamma_T
<- tidy(snopes_lda_F, matrix = "gamma")
snopes_gamma_F
<- cbind(snopes_gamma_T, rating = "true") |>
snopes_gamma rbind(cbind(snopes_gamma_F, rating = "false"))
%>%
snopes_gamma ggplot(aes(y = gamma, x = as.factor(topic))) +
geom_jitter() +
facet_wrap(~rating)
|>
snopes mutate(rating = case_when(
== "partly true" ~ "true",
rating == "mostly false" ~ "false",
rating == "legend" ~ "false",
rating == "probably false" ~ "false",
rating == "was true; now outdated" ~ "true",
rating == "mixture" ~ "true",
rating TRUE ~ rating
|>
)) filter(rating %in% c("true", "false")) |>
ggplot(aes(x = author, fill = rating)) +
geom_bar(position = position_dodge2(preserve = "single")) +
guides(x = guide_axis(angle = 45))
|>
snopes mutate(rating = case_when(
== "partly true" ~ "true",
rating == "mostly false" ~ "false",
rating == "legend" ~ "false",
rating == "probably false" ~ "false",
rating == "was true; now outdated" ~ "true",
rating == "mixture" ~ "true",
rating TRUE ~ rating
|>
)) filter(rating %in% c("true", "false")) |>
ggplot(aes(x = author, fill = rating)) +
geom_bar() +
guides(x = guide_axis(angle = 45))
praise()
[1] "You are dandy!"