For ages, I’ve been seeing the art and posts by Allison Horst and Alison Hill and others about this penguin dataset. I’m so excited to have a chance to work with the data, because I expect it to become a staple in my teaching. Love the idea of using the penguin dataset, among many reasons because it is connected to scientific queries and research. From the palmerpenguin vignette:
Data were collected and made available by Dr. Kristen Gorman and the Palmer Station, Antarctica LTER, a member of the Long Term Ecological Research Network.
Existing in their own package, palmerpenguins, the data are straightforward to download. Yay for data packages!!
head(penguins)
## # A tibble: 6 x 8
## species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex
## <fct> <fct> <dbl> <dbl> <int> <int> <fct>
## 1 Adelie Torge… 39.1 18.7 181 3750 male
## 2 Adelie Torge… 39.5 17.4 186 3800 fema…
## 3 Adelie Torge… 40.3 18 195 3250 fema…
## 4 Adelie Torge… NA NA NA NA <NA>
## 5 Adelie Torge… 36.7 19.3 193 3450 fema…
## 6 Adelie Torge… 39.3 20.6 190 3650 male
## # … with 1 more variable: year <int>
ggplot(penguins,aes(x = flipper_length_mm, y = bill_length_mm,
color = species)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
facet_wrap(~island)
One of the main tools I use in introductory statistics is the infer package which includes methods for inference through randomization, bootstrapping, and mathematical models.
\(H_0: p_1 - p_2 = 0\)
don’t reject \(H_0\)
stat_2prop <- penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
specify(sex ~ species, success = "female") %>%
calculate(stat = "diff in props", order = c("Gentoo", "Chinstrap"))
null_2prop <- penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
specify(sex ~ species, success = "female") %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in props", order = c("Gentoo", "Chinstrap"))
null_2prop %>%
infer::visualize() +
shade_p_value(obs_stat = stat_2prop, direction = "two-sided")
penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
prop_test(sex ~ species, order = c("Gentoo", "Chinstrap"))
## # A tibble: 1 x 6
## statistic chisq_df p_value alternative lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 0.000191 1 0.989 two.sided -0.160 0.137
\(H_0:\) variables (species and island) are independent
definitely reject \(H_0\)
stat_chisq <- penguins %>%
specify(island ~ species) %>%
calculate(stat = "Chisq")
null_chisq <- penguins %>%
specify(island ~ species) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "Chisq")
null_chisq %>%
infer::visualize() +
shade_p_value(obs_stat = stat_chisq, direction = "greater")
null_chisq %>%
infer::visualize(method = "theoretical") +
shade_p_value(obs_stat = stat_chisq, direction = "greater")
CI for the true mean weight of Gentoo penguins.
stat_mean <- penguins %>%
filter(species == "Gentoo") %>%
specify(response = body_mass_g) %>%
calculate(stat = "mean")
boot_mean <- penguins %>%
filter(species == "Gentoo") %>%
specify(response = body_mass_g) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "mean")
ci_perc_mean <- boot_mean %>%
get_confidence_interval(level = 0.9, type = "percentile")
ci_se_mean <- boot_mean %>%
get_confidence_interval(level = 0.9, type = "se",
point_estimate = stat_mean)
boot_mean %>%
infer::visualize() +
infer::shade_confidence_interval(ci_perc_mean,
color = "#569BBD",
fill = NULL, size = .5) +
infer::shade_confidence_interval(ci_se_mean,
color = "#4C721D",
fill = NULL, size = .5) +
ggtitle("Simulation-Based Bootstrap Distribution for Mean") +
geom_vline(xintercept = stat_mean$stat) +
geom_line(aes(y = replicate, x = stat, color = "a"), alpha = 0) + # bogus code
geom_line(aes(y = replicate, x = stat, color = "b"), alpha = 0) + # bogus code
geom_line(aes(y = replicate, x = stat, color = "c"), alpha = 0) + # bogus code
ylim(c(0,200)) +
guides(color = guide_legend(override.aes = list(alpha = 1))) +
scale_color_manual(name = "CI type",
values = c("a" = "#569BBD", "b" = "#4C721D", "c" = "#000000"),
labels = c("Percentile", "SE", "Observed stat"),
guide = "legend")
CI for the true median weight of Gentoo penguins.
stat_median <- penguins %>%
filter(species == "Gentoo") %>%
specify(response = body_mass_g) %>%
calculate(stat = "median")
boot_median <- penguins %>%
filter(species == "Gentoo") %>%
specify(response = body_mass_g) %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "median")
ci_perc_median <- boot_median %>%
get_confidence_interval(level = 0.9, type = "percentile")
ci_se_median <- boot_median %>%
get_confidence_interval(level = 0.9, type = "se",
point_estimate = stat_median)
boot_median %>%
infer::visualize() +
infer::shade_confidence_interval(ci_perc_median,
color = "#569BBD",
fill = NULL, size = .5) +
infer::shade_confidence_interval(ci_se_median,
color = "#4C721D",
fill = NULL, size = .5) +
ggtitle("Simulation-Based Bootstrap Distribution for Median") +
geom_vline(xintercept = stat_median$stat) +
geom_line(aes(y = replicate, x = stat, color = "a"), alpha = 0) + # bogus code
geom_line(aes(y = replicate, x = stat, color = "b"), alpha = 0) + # bogus code
geom_line(aes(y = replicate, x = stat, color = "c"), alpha = 0) + # bogus code
ylim(c(0,200)) +
guides(color = guide_legend(override.aes = list(alpha = 1))) +
scale_color_manual(name = "CI type",
values = c("a" = "#569BBD", "b" = "#4C721D", "c" = "#000000"),
labels = c("Percentile", "SE", "Observed stat"),
guide = "legend")
\(H_0: \mu_1 - \mu_2 = 0\)
reject \(H_0\)
stat_2mean <- penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
specify(body_mass_g ~ species) %>%
calculate(stat = "diff in means", order = c("Gentoo", "Chinstrap"))
null_2mean <- penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
specify(body_mass_g ~ species) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "permute") %>%
calculate(stat = "diff in means", order = c("Gentoo", "Chinstrap"))
null_2mean %>%
infer::visualize() +
shade_p_value(obs_stat = stat_2mean, direction = "two-sided")
penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
t_test(body_mass_g ~ species, order = c("Gentoo", "Chinstrap"))
## # A tibble: 1 x 6
## statistic t_df p_value alternative lower_ci upper_ci
## <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 20.6 170. 3.41e-48 two.sided 1214. 1471.
CI for \(\mu_1 - \mu_2\), the true difference in body weight for Gentoo vs Chinstrap penguins.
boot_2mean <- penguins %>%
filter(species %in% c("Gentoo", "Chinstrap")) %>%
specify(body_mass_g ~ species) %>%
hypothesize(null = "independence") %>%
generate(reps = 1000, type = "bootstrap") %>%
calculate(stat = "diff in means", order = c("Gentoo", "Chinstrap"))
ci_perc_2mean <- boot_2mean %>%
get_confidence_interval(level = 0.9, type = "percentile")
ci_se_2mean <- boot_2mean %>%
get_confidence_interval(level = 0.9, type = "se",
point_estimate = stat_2mean)
boot_2mean %>%
infer::visualize() +
infer::shade_confidence_interval(ci_perc_2mean,
color = "#569BBD",
fill = NULL, size = .5) +
infer::shade_confidence_interval(ci_se_2mean,
color = "#4C721D",
fill = NULL, size = .5) +
ggtitle("Simulation-Based Bootstrap Distribution for Diff in Means") +
geom_vline(xintercept = stat_2mean$stat) +
geom_line(aes(y = replicate, x = stat, color = "a"), alpha = 0) + # bogus code
geom_line(aes(y = replicate, x = stat, color = "b"), alpha = 0) + # bogus code
geom_line(aes(y = replicate, x = stat, color = "c"), alpha = 0) + # bogus code
ylim(c(0,200)) +
guides(color = guide_legend(override.aes = list(alpha = 1))) +
scale_color_manual(name = "CI type",
values = c("a" = "#569BBD", "b" = "#4C721D", "c" = "#000000"),
labels = c("Percentile", "SE", "Observed stat"),
guide = "legend")