The data this week comes from the Ergast API, which has a CC-BY license. H/t to Sara Stoudt for sharing the link to the data by way of Data is Plural!
# the Ergast csv files encode missing values as \N, so treat those as NA too
lap_times <- readr::read_csv("lap_times.csv", na = c("", "NA", "\\N"))
races <- readr::read_csv("races.csv", na = c("", "NA", "\\N"))
results <- readr::read_csv("results.csv", na = c("", "NA", "\\N"))
constructors <- readr::read_csv("constructors.csv", na = c("", "NA", "\\N"))
The ideas for combining the datasets and looking at race time come from: https://github.com/nrennie/tidytuesday/tree/main/2021/07-09-2021
races_joined <- results %>%
  dplyr::filter(positionText == 1) %>%           # keep only the race winners
  left_join(races, by = "raceId") %>%
  left_join(constructors, by = "constructorId") %>%
  mutate(win_time = milliseconds / laps) %>%     # per-lap winning time
  select(resultId, constructorId, laps, milliseconds, fastestLap,
         fastestLapSpeed, year, name.x, win_time, nationality) %>%
  filter(name.x %in% c("Monaco Grand Prix", "British Grand Prix",
                       "Italian Grand Prix", "French Grand Prix",
                       "German Grand Prix")) %>%
  filter(constructorId %in% c(1, 3, 6, 32, 131))
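Both races and constructors have a name column, so the second join suffixes them: name.x is the race name and name.y is the constructor name. A quick glimpse() confirms the combined structure:

dplyr::glimpse(races_joined)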
mean_races <- races_joined %>%
  group_by(year) %>%
  summarize(mean_ms = mean(as.numeric(milliseconds)),
            mean_win = mean(as.numeric(win_time)))
ggplot() +
  geom_line(data = mean_races,
            mapping = aes(x = year, y = mean_ms / 1000 / 60 / 60),
            color = "black") +
  geom_point(data = races_joined,
             mapping = aes(x = year, y = as.numeric(milliseconds) / 1000 / 60 / 60,
                           color = nationality)) +
  labs(x = "", y = "", subtitle = "Winning Time (hours)")
Whoa! There are some huge outliers. It turns out that in 2021 the Belgian Grand Prix was only one lap! Which makes me think that the winning time should be milliseconds / laps rather than the raw race time. I’m going to replot the data with a more appropriate value on the y-axis.
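Before replotting, a quick sanity check on where those outliers come from; a sketch sorting all winning rows by lap count (using the unfiltered results and races tables, since the Belgian Grand Prix isn't in races_joined):

results %>%
  dplyr::filter(positionText == 1) %>%   # winners only
  left_join(races, by = "raceId") %>%
  select(year, name, laps, milliseconds) %>%
  arrange(laps) %>%                      # shortest races first
  head()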
ggplot() +
  geom_line(data = mean_races,
            mapping = aes(x = year, y = mean_win / 1000 / 60 / 60),
            color = "black") +
  geom_point(data = races_joined,
             mapping = aes(x = year, y = as.numeric(win_time) / 1000 / 60 / 60,
                           color = nationality)) +
  labs(x = "", y = "", subtitle = "Winning Time (hours)")
The goal of the model will be to predict win_time (= milliseconds / laps) using constructorId (1, 3, 6, 32, 131: 5 levels), fastestLap, fastestLapSpeed, year, name.x (5 levels), and nationality (10 levels).
set.seed(4747)
formula1_split <- races_joined %>%
  drop_na() %>%       # model building needs complete cases
  initial_split()
formula1_train <- training(formula1_split)
formula1_test <- testing(formula1_split)
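initial_split() holds out a quarter of the rows by default, and drop_na() shrinks the data first, so it's worth checking how many complete cases survive (the ranger output below reports 37 training rows):

nrow(formula1_train)
nrow(formula1_test)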
formula1_recipe <- recipe(win_time ~ ., data = formula1_train) %>%
  update_role(resultId, new_role = "ID") %>%           # resultId identifies rows but shouldn't be a predictor
  step_dummy(name.x, nationality, constructorId) %>%   # note: step_dummy() expects nominal columns, so current recipes versions may need numeric constructorId converted to a factor first
  step_rm(milliseconds)                                # win_time is derived from milliseconds, so drop the original
formula1_recipe
## Data Recipe
##
## Inputs:
##
## role #variables
## ID 1
## outcome 1
## predictor 8
##
## Operations:
##
## Dummy variables from name.x, nationality, constructorId
## Delete terms milliseconds
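To see exactly which columns the model will receive, the recipe can be prepped and baked; a quick sketch (in recent versions of recipes, bake(new_data = NULL) returns the processed training set, taking over the role juice() used to play):

formula1_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  dplyr::glimpse()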
formula1_mod <- rand_forest(
  mtry = 4,       # predictors sampled at each split
  trees = 100,    # trees in the forest
  min_n = 3) %>%  # minimum observations per terminal node
  set_mode("regression") %>%
  set_engine("ranger")
If the plan is to tune the hyperparameters, use tune() as a placeholder for those parameters; a sketch of actually running the tuning follows the spec below:
formula1_mod_tune <- rand_forest(
  mtry = tune(),
  trees = 100,
  min_n = tune()) %>%
  set_mode("regression") %>%
  set_engine("ranger")
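To run the tuning, the spec gets paired with resamples; a minimal sketch, assuming 5-fold cross-validation and a default-sized grid (with only about 37 training rows the folds are tiny, so treat the results with caution):

set.seed(4747)
formula1_folds <- vfold_cv(formula1_train, v = 5)

formula1_tuned <- workflow() %>%
  add_recipe(formula1_recipe) %>%
  add_model(formula1_mod_tune) %>%
  tune_grid(resamples = formula1_folds, grid = 10)

show_best(formula1_tuned, metric = "rmse")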
formula1_wf1 <- workflow() %>%
  add_recipe(formula1_recipe) %>%
  add_model(formula1_mod)
formula1_wf1
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_dummy()
## • step_rm()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = 4
## trees = 100
## min_n = 3
##
## Computational engine: ranger
formula1_results <- formula1_wf1 %>%
  fit(data = formula1_train)
formula1_results
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_dummy()
## • step_rm()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Ranger result
##
## Call:
## ranger::ranger(x = maybe_data_frame(x), y = y, mtry = min_cols(~4, x), num.trees = ~100, min.node.size = min_rows(~3, x), num.threads = 1, verbose = FALSE, seed = sample.int(10^5, 1))
##
## Type: Regression
## Number of trees: 100
## Sample size: 37
## Number of independent variables: 11
## Mtry: 4
## Target node size: 3
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 239116501
## R squared (OOB): 0.2423603
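For scale, that OOB error is in squared milliseconds, so the square root puts it back on the per-lap time scale:

sqrt(239116501) / 1000  # roughly 15.5 seconds of per-lap error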
formula1_results %>%
  predict(new_data = formula1_train) %>%
  cbind(formula1_train) %>%
  ggplot() +
  geom_point(aes(x = win_time, y = .pred, color = name.x)) +
  geom_abline(slope = 1, intercept = 0)
formula1_results %>%
  predict(new_data = formula1_test) %>%
  cbind(formula1_test) %>%
  ggplot() +
  geom_point(aes(x = win_time, y = .pred, color = name.x)) +
  geom_abline(slope = 1, intercept = 0)
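The plots tell the story qualitatively; yardstick's metrics() gives the matching numbers (RMSE, R-squared, and MAE for a regression) on the test set:

formula1_results %>%
  predict(new_data = formula1_test) %>%
  cbind(formula1_test) %>%
  metrics(truth = win_time, estimate = .pred)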
The model wasn’t great, but I’m going to call this activity a win because I was able to build a pretty straightforward model that did predict the per-lap winning time to some degree. Note that because we wanted a complete dataset (no missing values) with a bunch of variables, the dataset got much smaller: 37 training points and 13 test points isn’t much to build a model on.
praise()
## [1] "You are wonderful!"