The data this week comes from Data.World by way of Sean Miller, Billboard.com and Spotify.
billboard <- read_csv("billboard.csv")
audio_features <- read_csv("audio_features.csv")
billboard %>%
select(song_id) %>%
distinct() %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 29389
First, we want to filter
the rows to keep only the songs of interest. We keep the songs that are on the Billboard Top 100 Chart for at least 20 weeks total.
songs <- billboard %>%
group_by(song_id) %>%
arrange(desc(week_id)) %>%
slice(1) %>%
ungroup %>%
filter(weeks_on_chart >= 20)
songs %>%
ggplot() +
geom_point(aes(x = weeks_on_chart, y = peak_position)) +
scale_y_continuous(trans = "reverse") +
ggtitle("Peak position (order reversed) by weeks on chart")
Using the top songs, we’ll join
with the audio_features
dataset to add features which we can then model to predict genre.
Note: when we do a left_join
there should be the same number of rows in the output dataframe as in the left dataframe (here songs
). In our left_join
we got more rows. Yikes!
songs_full <- songs %>% left_join(audio_features, by = "song_id")
# thanks to @nrennie at
# https://github.com/nrennie/tidytuesday/blob/main/2021/14-09-2021/14092021.R
# function for genre
select_genre <- function(word){
if (is.na(word)){
return(NA)
}
if (grepl("pop", word, fixed = TRUE)){
return("Pop")
}
if (grepl("rap", word, fixed = TRUE)){
return("Rap")
}
if (grepl("rock", word, fixed = TRUE)){
return("Rock")
}
if (grepl("country", word, fixed = TRUE)){
return("Country")
}
if (grepl("r&b", word, fixed = TRUE)){
return("R & B")
}
if (grepl("hip hop", word, fixed = TRUE)){
return("Hip Hop")
}
else {
return("Other")
}
}
main_genre = sapply(songs_full$spotify_genre, function(x) unlist(strsplit(str_sub(x, 3, -3), split = "\', \'"))[1])
songs_full$main_genre <- main_genre
genre <- unname(sapply(songs_full$main_genre, function(x) select_genre(x)))
songs_full$genre <- genre
songs_full %>%
ggplot() +
geom_point(aes(x = weeks_on_chart, y = peak_position, color = genre)) +
scale_y_continuous(trans = "reverse") +
ggtitle("Peak position (order reversed) by weeks on chart")
songs_full %>%
select(genre, main_genre, spotify_genre)
## # A tibble: 1,927 × 3
## genre main_genre spotify_genre
## <chr> <chr> <chr>
## 1 Pop dance pop ['dance pop', 'pop', 'post-teen pop']
## 2 Hip Hop east coast hip … ['east coast hip hop', 'hip hop', 'pop rap', 'rap',…
## 3 <NA> <NA> []
## 4 Pop dance pop ['dance pop', 'hip hop', 'pop', 'pop rap', 'rap', '…
## 5 Other adult standards ['adult standards', 'contemporary vocal jazz', 'lou…
## 6 Rap new jersey rap ['new jersey rap']
## 7 Country contemporary co… ['contemporary country', 'country', 'country dawn',…
## 8 Other adult standards ['adult standards', 'bubblegum pop', 'country rock'…
## 9 Pop dance pop ['dance pop', 'hip hop', 'pop', 'pop rap', 'rap', '…
## 10 Pop dance pop ['dance pop', 'hip pop', 'pop', 'pop rap', 'post-te…
## # … with 1,917 more rows
The goal of the model will be to predict genre
from:
danceability
, energy
, key
, loudness
, mode
, speechiness
, acousticness
, instrumentalness
, liveness
, valence
, tempo
, spotify_track_popularity
set.seed(4747)
songs_split <- songs_full %>%
select(song_id, week_id, genre, danceability, energy, key, loudness, mode, speechiness,
acousticness, instrumentalness, liveness, valence, tempo,
spotify_track_popularity) %>%
filter(genre != "R & B") %>%
drop_na() %>%
initial_split()
songs_train <- training(songs_split)
songs_test <- testing(songs_split)
songs_recipe <- recipe(genre ~., data = songs_train) %>%
update_role(song_id, new_role = "ID") %>%
update_role(week_id, new_role = "ID") %>%
step_normalize(all_predictors())
songs_recipe
## Data Recipe
##
## Inputs:
##
## role #variables
## ID 2
## outcome 1
## predictor 12
##
## Operations:
##
## Centering and scaling for all_predictors()
songs_mod <- nearest_neighbor(
neighbors = 7,
weight_func = "gaussian") %>%
set_mode("classification") %>%
set_engine("kknn")
if the plan is to tune hyperparameters, then use tune()
for the parameters:
songs_mod_tune <- nearest_neighbor(
neighbors = tune(),
weight_func = "gaussian") %>%
set_mode("classification") %>%
set_engine("kknn")
songs_wf1 <- workflow() %>%
add_recipe(songs_recipe) %>%
add_model(songs_mod)
songs_wf1
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = 7
## weight_func = gaussian
##
## Computational engine: kknn
songs_results <- songs_wf1 %>%
fit(data = songs_train)
songs_results
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
##
## Call:
## kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(7, data, 5), kernel = ~"gaussian")
##
## Type of response variable: nominal
## Minimal misclassification: 0.6227451
## Best kernel: gaussian
## Best k: 7
songs_results %>% predict(new_data = songs_train) %>% cbind(songs_train) %>%
select(.pred_class, genre) %>% table()
## genre
## .pred_class Country Hip Hop Other Pop Rap Rock
## Country 106 2 22 18 1 6
## Hip Hop 0 66 2 9 9 1
## Other 7 3 242 42 4 29
## Pop 23 40 67 360 22 33
## R & B 0 0 0 0 0 0
## Rap 0 1 0 4 21 0
## Rock 9 1 9 11 0 91
songs_results %>% predict(new_data = songs_train) %>% cbind(songs_train) %>%
select(.pred_class, genre) %>% table() %>% as.data.frame() %>%
ggplot(mapping = aes(y = .pred_class,
x = genre)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "blue",
high = "red") +
ggtitle("Confusion matrix on training songs") +
labs(y = "Predicted genre",
x = "Original genre")
songs_results %>% predict(new_data = songs_test) %>% cbind(songs_test) %>%
select(.pred_class, genre) %>% table()
## genre
## .pred_class Country Hip Hop Other Pop Rap Rock
## Country 25 0 8 6 1 3
## Hip Hop 0 23 4 5 2 0
## Other 2 4 66 15 2 11
## Pop 9 11 29 116 6 9
## R & B 0 0 0 0 0 0
## Rap 0 2 0 1 9 0
## Rock 3 0 1 9 0 39
songs_results %>% predict(new_data = songs_test) %>% cbind(songs_test) %>%
select(.pred_class, genre) %>% table() %>% as.data.frame() %>%
ggplot(mapping = aes(y = .pred_class,
x = genre)) +
geom_tile(aes(fill = Freq)) +
geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
scale_fill_gradient(low = "blue",
high = "red") +
ggtitle("Confusion matrix on test songs") +
labs(y = "Predicted genre",
x = "Original genre")
First, a few extra tidbits we need for finding the best value of k
.
songs_grid = grid_regular(neighbors(), levels = 5)
set.seed(47)
songs_folds <- vfold_cv(songs_train)
songs_wf2 <- workflow() %>%
add_recipe(songs_recipe) %>%
add_model(songs_mod_tune)
songs_wf2
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (classification)
##
## Main Arguments:
## neighbors = tune()
## weight_func = gaussian
##
## Computational engine: kknn
songs_results2 <- songs_wf2 %>%
tune_grid(
resamples = songs_folds,
grid = songs_grid
)
songs_results2
## # Tuning results
## # 10-fold cross-validation
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [1134/127]> Fold01 <tibble [10 × 5]> <tibble [0 × 1]>
## 2 <split [1135/126]> Fold02 <tibble [10 × 5]> <tibble [0 × 1]>
## 3 <split [1135/126]> Fold03 <tibble [10 × 5]> <tibble [0 × 1]>
## 4 <split [1135/126]> Fold04 <tibble [10 × 5]> <tibble [0 × 1]>
## 5 <split [1135/126]> Fold05 <tibble [10 × 5]> <tibble [0 × 1]>
## 6 <split [1135/126]> Fold06 <tibble [10 × 5]> <tibble [0 × 1]>
## 7 <split [1135/126]> Fold07 <tibble [10 × 5]> <tibble [0 × 1]>
## 8 <split [1135/126]> Fold08 <tibble [10 × 5]> <tibble [0 × 1]>
## 9 <split [1135/126]> Fold09 <tibble [10 × 5]> <tibble [0 × 1]>
## 10 <split [1135/126]> Fold10 <tibble [10 × 5]> <tibble [0 × 1]>
songs_results2 %>%
collect_metrics()
## # A tibble: 10 × 7
## neighbors .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 1 accuracy multiclass 0.358 10 0.0148 Preprocessor1_Model1
## 2 1 roc_auc hand_till 0.595 10 0.0111 Preprocessor1_Model1
## 3 3 accuracy multiclass 0.351 10 0.0170 Preprocessor1_Model2
## 4 3 roc_auc hand_till 0.663 10 0.0108 Preprocessor1_Model2
## 5 5 accuracy multiclass 0.385 10 0.0146 Preprocessor1_Model3
## 6 5 roc_auc hand_till 0.704 10 0.0111 Preprocessor1_Model3
## 7 7 accuracy multiclass 0.404 10 0.0142 Preprocessor1_Model4
## 8 7 roc_auc hand_till 0.726 10 0.00968 Preprocessor1_Model4
## 9 10 accuracy multiclass 0.411 10 0.0123 Preprocessor1_Model5
## 10 10 roc_auc hand_till 0.739 10 0.00774 Preprocessor1_Model5
songs_results2 %>%
collect_metrics() %>%
ggplot(aes(x = neighbors, y = mean)) +
geom_line(size = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number())
songs_results2 %>%
show_best("roc_auc")
## # A tibble: 5 × 7
## neighbors .metric .estimator mean n std_err .config
## <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 10 roc_auc hand_till 0.739 10 0.00774 Preprocessor1_Model5
## 2 7 roc_auc hand_till 0.726 10 0.00968 Preprocessor1_Model4
## 3 5 roc_auc hand_till 0.704 10 0.0111 Preprocessor1_Model3
## 4 3 roc_auc hand_till 0.663 10 0.0108 Preprocessor1_Model2
## 5 1 roc_auc hand_till 0.595 10 0.0111 Preprocessor1_Model1
songs_best <- songs_results2 %>%
select_best("accuracy")
songs_best
## # A tibble: 1 × 2
## neighbors .config
## <int> <chr>
## 1 10 Preprocessor1_Model5
songs_wf3 <- songs_wf2 %>%
finalize_workflow(songs_best)
songs_fit3 <- songs_wf3 %>%
last_fit(songs_split)
songs_fit3 %>%
collect_metrics
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy multiclass 0.447 Preprocessor1_Model1
## 2 roc_auc hand_till 0.743 Preprocessor1_Model1
Default way to make ROC curve for each of the music genres. So cool!!!
songs_fit3 %>%
collect_predictions() %>%
yardstick::roc_curve(genre, .pred_Country:.pred_Rock) %>%
autoplot()
praise()
## [1] "You are funkadelic!"