The Data

The data this week comes from Data.World by way of Sean Miller, Billboard.com and Spotify.

billboard <- read_csv("billboard.csv")
audio_features <- read_csv("audio_features.csv")
billboard %>%
  select(song_id) %>%
  distinct() %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1 29389

Finding the songs of interest

First, we want to filter the rows to keep only the songs of interest. We keep the songs that are on the Billboard Top 100 Chart for at least 20 weeks total.

songs <- billboard %>%
  group_by(song_id) %>%
  arrange(desc(week_id)) %>%
  slice(1) %>%
  ungroup %>%
  filter(weeks_on_chart >= 20) 
songs %>%
  ggplot() + 
  geom_point(aes(x = weeks_on_chart, y = peak_position)) +
  scale_y_continuous(trans = "reverse") +
  ggtitle("Peak position (order reversed) by weeks on chart")

Using the top songs, we’ll join with the audio_features dataset to add features which we can then model to predict genre.

Note: when we do a left_join there should be the same number of rows in the output dataframe as in the left dataframe (here songs). In our left_join we got more rows. Yikes!

songs_full <- songs %>% left_join(audio_features, by = "song_id")
# thanks to @nrennie at
# https://github.com/nrennie/tidytuesday/blob/main/2021/14-09-2021/14092021.R
# function for genre
select_genre <- function(word){
  if (is.na(word)){
    return(NA)
  }
  if (grepl("pop", word, fixed = TRUE)){
    return("Pop")
  }
  if (grepl("rap", word, fixed = TRUE)){
    return("Rap")
  }
  if (grepl("rock", word, fixed = TRUE)){
    return("Rock")
  }
  if (grepl("country", word, fixed = TRUE)){
    return("Country")
  }
  if (grepl("r&b", word, fixed = TRUE)){
    return("R & B")
  }
  if (grepl("hip hop", word, fixed = TRUE)){
    return("Hip Hop")
  }
  else {
    return("Other")
  }
}
main_genre = sapply(songs_full$spotify_genre, function(x) unlist(strsplit(str_sub(x, 3, -3), split = "\', \'"))[1])
songs_full$main_genre <- main_genre
genre <- unname(sapply(songs_full$main_genre, function(x) select_genre(x)))
songs_full$genre <- genre
songs_full %>%
  ggplot() + 
  geom_point(aes(x = weeks_on_chart, y = peak_position, color = genre)) +
  scale_y_continuous(trans = "reverse") +
  ggtitle("Peak position (order reversed) by weeks on chart")

songs_full %>%
  select(genre, main_genre, spotify_genre)
## # A tibble: 1,927 × 3
##    genre   main_genre       spotify_genre                                       
##    <chr>   <chr>            <chr>                                               
##  1 Pop     dance pop        ['dance pop', 'pop', 'post-teen pop']               
##  2 Hip Hop east coast hip … ['east coast hip hop', 'hip hop', 'pop rap', 'rap',…
##  3 <NA>    <NA>             []                                                  
##  4 Pop     dance pop        ['dance pop', 'hip hop', 'pop', 'pop rap', 'rap', '…
##  5 Other   adult standards  ['adult standards', 'contemporary vocal jazz', 'lou…
##  6 Rap     new jersey rap   ['new jersey rap']                                  
##  7 Country contemporary co… ['contemporary country', 'country', 'country dawn',…
##  8 Other   adult standards  ['adult standards', 'bubblegum pop', 'country rock'…
##  9 Pop     dance pop        ['dance pop', 'hip hop', 'pop', 'pop rap', 'rap', '…
## 10 Pop     dance pop        ['dance pop', 'hip pop', 'pop', 'pop rap', 'post-te…
## # … with 1,917 more rows

Building a k-nn classifier

The goal of the model will be to predict genre from:

danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, spotify_track_popularity

data split

set.seed(4747)
songs_split <- songs_full %>%
  select(song_id, week_id, genre, danceability, energy, key, loudness, mode, speechiness,
         acousticness, instrumentalness, liveness, valence, tempo, 
         spotify_track_popularity) %>%
  filter(genre != "R & B") %>%
  drop_na() %>%
  initial_split()

songs_train <- training(songs_split)
songs_test <- testing(songs_split)

recipe

songs_recipe <- recipe(genre ~., data = songs_train) %>%
  update_role(song_id, new_role = "ID") %>%
  update_role(week_id, new_role = "ID") %>%
  step_normalize(all_predictors())

songs_recipe
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         ID          2
##    outcome          1
##  predictor         12
## 
## Operations:
## 
## Centering and scaling for all_predictors()

set a model

songs_mod <- nearest_neighbor(
  neighbors = 7,
  weight_func = "gaussian") %>%
  set_mode("classification") %>%
  set_engine("kknn")

alternative model setting

if the plan is to tune hyperparameters, then use tune() for the parameters:

songs_mod_tune <- nearest_neighbor(
  neighbors = tune(),
  weight_func = "gaussian") %>%
  set_mode("classification") %>%
  set_engine("kknn")

build workflow

songs_wf1 <- workflow() %>%
  add_recipe(songs_recipe) %>%
  add_model(songs_mod)

songs_wf1
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = 7
##   weight_func = gaussian
## 
## Computational engine: kknn

fit model

songs_results <- songs_wf1 %>%
  fit(data = songs_train)

songs_results
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## 
## Call:
## kknn::train.kknn(formula = ..y ~ ., data = data, ks = min_rows(7,     data, 5), kernel = ~"gaussian")
## 
## Type of response variable: nominal
## Minimal misclassification: 0.6227451
## Best kernel: gaussian
## Best k: 7
songs_results %>% predict(new_data = songs_train) %>% cbind(songs_train) %>%
  select(.pred_class, genre) %>% table()
##            genre
## .pred_class Country Hip Hop Other Pop Rap Rock
##     Country     106       2    22  18   1    6
##     Hip Hop       0      66     2   9   9    1
##     Other         7       3   242  42   4   29
##     Pop          23      40    67 360  22   33
##     R & B         0       0     0   0   0    0
##     Rap           0       1     0   4  21    0
##     Rock          9       1     9  11   0   91
songs_results %>% predict(new_data = songs_train) %>% cbind(songs_train) %>%
  select(.pred_class, genre) %>% table() %>% as.data.frame() %>%
  ggplot(mapping = aes(y = .pred_class,
                     x = genre)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
  scale_fill_gradient(low = "blue",
                      high = "red") +
  ggtitle("Confusion matrix on training songs") +
  labs(y = "Predicted genre",
       x = "Original genre")

heat map to show the confusion matrix comparing the actual genre and the predicted genre for the training data.  the model does quite well with the vast majority of the observations being correctly predicted.  the most likely incorrect prediction is for pop music.

songs_results %>% predict(new_data = songs_test) %>% cbind(songs_test) %>%
  select(.pred_class, genre) %>% table()
##            genre
## .pred_class Country Hip Hop Other Pop Rap Rock
##     Country      25       0     8   6   1    3
##     Hip Hop       0      23     4   5   2    0
##     Other         2       4    66  15   2   11
##     Pop           9      11    29 116   6    9
##     R & B         0       0     0   0   0    0
##     Rap           0       2     0   1   9    0
##     Rock          3       0     1   9   0   39
songs_results %>% predict(new_data = songs_test) %>% cbind(songs_test) %>%
  select(.pred_class, genre) %>% table() %>% as.data.frame() %>%
  ggplot(mapping = aes(y = .pred_class,
                     x = genre)) +
  geom_tile(aes(fill = Freq)) +
  geom_text(aes(label = sprintf("%1.0f", Freq)), vjust = 1) +
  scale_fill_gradient(low = "blue",
                      high = "red") +
  ggtitle("Confusion matrix on test songs") +
  labs(y = "Predicted genre",
       x = "Original genre")

heat map to show the confusion matrix comparing the actual genre and the predicted genre for the test data.  the model less well on the test data, but still, much better than chance.  pop and other get predicted for one another.  the most likely incorrect prediction is for pop music.

Tune the model!

build workflow

First, a few extra tidbits we need for finding the best value of k.

songs_grid = grid_regular(neighbors(), levels = 5)

set.seed(47)
songs_folds <- vfold_cv(songs_train)
songs_wf2 <- workflow() %>%
  add_recipe(songs_recipe) %>%
  add_model(songs_mod_tune)

songs_wf2
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: nearest_neighbor()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## K-Nearest Neighbor Model Specification (classification)
## 
## Main Arguments:
##   neighbors = tune()
##   weight_func = gaussian
## 
## Computational engine: kknn
songs_results2 <- songs_wf2 %>%
  tune_grid(
    resamples = songs_folds,
    grid = songs_grid
  )

songs_results2
## # Tuning results
## # 10-fold cross-validation 
## # A tibble: 10 × 4
##    splits             id     .metrics          .notes          
##    <list>             <chr>  <list>            <list>          
##  1 <split [1134/127]> Fold01 <tibble [10 × 5]> <tibble [0 × 1]>
##  2 <split [1135/126]> Fold02 <tibble [10 × 5]> <tibble [0 × 1]>
##  3 <split [1135/126]> Fold03 <tibble [10 × 5]> <tibble [0 × 1]>
##  4 <split [1135/126]> Fold04 <tibble [10 × 5]> <tibble [0 × 1]>
##  5 <split [1135/126]> Fold05 <tibble [10 × 5]> <tibble [0 × 1]>
##  6 <split [1135/126]> Fold06 <tibble [10 × 5]> <tibble [0 × 1]>
##  7 <split [1135/126]> Fold07 <tibble [10 × 5]> <tibble [0 × 1]>
##  8 <split [1135/126]> Fold08 <tibble [10 × 5]> <tibble [0 × 1]>
##  9 <split [1135/126]> Fold09 <tibble [10 × 5]> <tibble [0 × 1]>
## 10 <split [1135/126]> Fold10 <tibble [10 × 5]> <tibble [0 × 1]>

collect metrics

songs_results2 %>%
  collect_metrics()
## # A tibble: 10 × 7
##    neighbors .metric  .estimator  mean     n std_err .config             
##        <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>               
##  1         1 accuracy multiclass 0.358    10 0.0148  Preprocessor1_Model1
##  2         1 roc_auc  hand_till  0.595    10 0.0111  Preprocessor1_Model1
##  3         3 accuracy multiclass 0.351    10 0.0170  Preprocessor1_Model2
##  4         3 roc_auc  hand_till  0.663    10 0.0108  Preprocessor1_Model2
##  5         5 accuracy multiclass 0.385    10 0.0146  Preprocessor1_Model3
##  6         5 roc_auc  hand_till  0.704    10 0.0111  Preprocessor1_Model3
##  7         7 accuracy multiclass 0.404    10 0.0142  Preprocessor1_Model4
##  8         7 roc_auc  hand_till  0.726    10 0.00968 Preprocessor1_Model4
##  9        10 accuracy multiclass 0.411    10 0.0123  Preprocessor1_Model5
## 10        10 roc_auc  hand_till  0.739    10 0.00774 Preprocessor1_Model5
songs_results2 %>%
  collect_metrics() %>%
  ggplot(aes(x = neighbors, y = mean)) +
  geom_line(size = 1.5, alpha = 0.6) +
  geom_point(size = 2) +
  facet_wrap(~ .metric, scales = "free", nrow = 2) +
  scale_x_log10(labels = scales::label_number()) 

songs_results2 %>%
  show_best("roc_auc")
## # A tibble: 5 × 7
##   neighbors .metric .estimator  mean     n std_err .config             
##       <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1        10 roc_auc hand_till  0.739    10 0.00774 Preprocessor1_Model5
## 2         7 roc_auc hand_till  0.726    10 0.00968 Preprocessor1_Model4
## 3         5 roc_auc hand_till  0.704    10 0.0111  Preprocessor1_Model3
## 4         3 roc_auc hand_till  0.663    10 0.0108  Preprocessor1_Model2
## 5         1 roc_auc hand_till  0.595    10 0.0111  Preprocessor1_Model1

fit model

songs_best <- songs_results2 %>%
  select_best("accuracy")

songs_best
## # A tibble: 1 × 2
##   neighbors .config             
##       <int> <chr>               
## 1        10 Preprocessor1_Model5
songs_wf3 <- songs_wf2 %>%
  finalize_workflow(songs_best)

songs_fit3 <- songs_wf3 %>%
  last_fit(songs_split)

songs_fit3 %>%
  collect_metrics
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy multiclass     0.447 Preprocessor1_Model1
## 2 roc_auc  hand_till      0.743 Preprocessor1_Model1

Default way to make ROC curve for each of the music genres. So cool!!!

songs_fit3 %>%
  collect_predictions() %>%
  yardstick::roc_curve(genre, .pred_Country:.pred_Rock) %>%
  autoplot()

a separate ROC curve for each genre, where the sensitivity and specificity are calculated for each group on a correct / incorrect basis.  Hip hop has the best curve, pop's curve is the worst (not much better than random guessing).

praise()
## [1] "You are funkadelic!"