library(tidyverse) # ggplot, lubridate, dplyr, stringr, readr...
library(praise)
March Madness
The Data
March is NCAA basketball March Madness! This week’s data is NCAA Men’s March Madness data from Nishaan Amin’s Kaggle dataset and analysis Bracketology: predicting March Madness.
<- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-03-26/team-results.csv')
team_results <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-03-26/public-picks.csv') public_picks
Team performance
Caveat: some of the variables were a little difficult for me to parse. I’m not sure what “performance against” means in the PAKE and PASE variables. And at first we were looking at schools with different PAKE and PASE. But later we thought it might be more interesting to look at schools whose PAKE and PASE were both high (or low).
<- team_results |>
seed_data mutate(expectations = ifelse(PAKE >= PASE, "underseeded", "overseeded")) |>
mutate(rank_diff = PAKE-PASE) |>
#filter(abs(PAKE - PASE) > 1) |>
mutate(expect_grps = case_when(
- PASE < -2 ~ "way_under",
PAKE - PASE < -1 ~ "little_under",
PAKE - PASE < 0 ~ "under",
PAKE - PASE < 1 ~ "over",
PAKE - PASE < 2 ~ "little_over",
PAKE TRUE ~ "way_over")
|>
) mutate(expect_grps = factor(expect_grps,
levels = c("way_under", "little_under","under", "over",
"little_over", "way_over")))
|>
seed_data ggplot(aes(x = PAKE, y = PASE, color = expect_grps)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
labs(x = "Performance against Komputer ranking", y = "Performance against seed ranking", color = "expected groups") +
scale_color_manual(values = c("red", "orange", "yellow", "lightblue", "blue", "purple")) +
::geom_label_repel(data = filter(seed_data, abs(PAKE-PASE) > 1.5), mapping = aes(label = TEAM), key_glyph = "point") +
ggrepelguides(color = guide_legend(override.aes = list(size = 3)))
library(ggradar)
|>
team_results mutate(F4PERCENT = parse_number(F4PERCENT)) |>
select(TEAM, PAKE, PASE, GAMES, W, R64, F4PERCENT) |>
mutate(across(PAKE:F4PERCENT, scale)) |>
filter(TEAM %in% c( "Houston", "Butler", "Florida Atlantic", "Kansas", "Purdue"))
# A tibble: 5 × 7
TEAM PAKE[,1] PASE[,1] GAMES[,1] W[,1] R64[,1] F4PERCENT[,1]
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Butler 3.17 3.69 1.75 1.76 1.43 0.839
2 Florida Atlantic 1.49 1.44 -0.293 -0.000574 -0.892 -0.466
3 Houston 0.00421 0.892 0.973 1.08 0.562 2.39
4 Kansas 1.90 -0.764 4.38 4.88 3.18 3.19
5 Purdue -1.98 -1.87 2.05 1.63 2.60 2.30
|>
team_results mutate(F4PERCENT = parse_number(F4PERCENT)) |>
select(TEAM, F4PERCENT, PAKE, PASE, GAMES, W, R64) |>
mutate(across(F4PERCENT:R64, scale)) |>
filter(TEAM %in% c("Houston", "Butler", "Florida Atlantic", "Kansas", "Purdue")) |>
ggradar(values.radar = c("-4", "0", "4"),
grid.min = -2, grid.mid = 0, grid.max = 5,
group.line.width = 1,
group.point.size = 2)
praise()
[1] "You are great!"