What you'll learn

For this codelab, you will need to install the tidymodels and palmerpenguins packages.

The tidymodels meta-package is a collection of machine-learning packages that are designed to work well together.

The palmerpenguins package contains a dataset of observed penguin species and their various physical characteristics. The aim of this codelab is to build a model to predict a penguin's species based on these recorded physical characteristics.

To install the required packages, run the following code in the R console.

install.packages(c("tidymodels", "palmerpenguins"))


# Attach the modelling framework and the example data set.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
library(tidymodels, quietly = TRUE)
library(palmerpenguins, quietly = TRUE)

# Split the penguins data into an analysis set and an assessment set,
# stratifying the split on the species column.
# NOTE: the split is random; call set.seed() beforehand for reproducible results.
df_split <- initial_split(penguins, strata = species)

# Print the split to see how many rows went to each set.
df_split
## <Analysis/Assess/Total>
## <258/86/344>

# Extract the two data frames from the split object.
df_analysis <- df_split %>% analysis()
df_assess <- df_split %>% assessment()
# Count the missing values in every column of the raw data
# (one row, one count per column).
penguins %>%
  summarise(
    across(
      everything(),
      function(column) sum(is.na(column))
    )
  )
## # A tibble: 1 x 8
##   species island bill_length_mm bill_depth_mm flipper_length_... body_mass_g   sex
##     <int>  <int>          <int>         <int>            <int>       <int> <int>
## 1       0      0              2             2                2           2    11
## # ... with 1 more variable: year <int>
# Start with a bare recipe: species is the outcome, every other column is a
# predictor.
rcp <-
  recipe(species ~ ., data = df_analysis)

# Extend the recipe with imputation steps so that missing values are filled in:
# numeric predictors with the column mean, nominal predictors with the mode.
# The pipe chain must end here; a trailing %>% would feed the recipe into the
# next statement.
rcp <-
  recipe(species ~ ., data = df_analysis) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors())

# Estimate the steps on the analysis data and apply them, to inspect the
# preprocessed result.
rcp %>% prep() %>% bake(df_analysis)
## # A tibble: 258 x 8
##    island  bill_length_mm bill_depth_mm flipper_length_... body_mass_g sex    year
##    <fct>            <dbl>         <dbl>            <int>       <int> <fct> <int>
##  1 Torger...           39.1          18.7              181        3750 male   2007
##  2 Torger...           40.3          18                195        3250 fema...  2007
##  3 Torger...           44.0          17.2              201        4212 male   2007
##  4 Torger...           39.3          20.6              190        3650 male   2007
##  5 Torger...           38.9          17.8              181        3625 fema...  2007
##  6 Torger...           39.2          19.6              195        4675 male   2007
##  7 Torger...           34.1          18.1              193        3475 male   2007
##  8 Torger...           42            20.2              190        4250 male   2007
##  9 Torger...           37.8          17.1              186        3300 male   2007
## 10 Torger...           37.8          17.3              180        3700 male   2007
## # ... with 248 more rows, and 1 more variable: species <fct>
# Model specification: a classification decision tree.
# "rpart" is the default engine for decision_tree(); it is named explicitly so
# the fitting backend is visible to the reader.
classifier <-
  decision_tree(mode = "classification") %>%
  set_engine("rpart")

# Bundle the preprocessing recipe and the model specification into a single
# workflow.
wf <-
  workflow() %>%
  add_recipe(rcp) %>%
  add_model(classifier)

# Train the recipe and the model on the analysis set, then print the fitted
# workflow.
model <- fit(wf, data = df_analysis)
model
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## • step_impute_mean()
## • step_impute_mode()
## ── Model ───────────────────────────────────────────────────────────────────────
## n= 258 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 1) root 258 144 Adelie (0.441860465 0.197674419 0.360465116)  
##   2) flipper_length_mm< 207.5 163  49 Adelie (0.699386503 0.294478528 0.006134969)  
##     4) bill_length_mm< 44.65 116   5 Adelie (0.956896552 0.043103448 0.000000000) *
##     5) bill_length_mm>=44.65 47   4 Chinstrap (0.063829787 0.914893617 0.021276596) *
##   3) flipper_length_mm>=207.5 95   3 Gentoo (0.000000000 0.031578947 0.968421053) *
# predict() returns only the .pred_class column, so bind the assessment data
# back on to make the true species available to the metric functions below.
df_pred <-
  predict(model, new_data = df_assess) %>%
  bind_cols(df_assess)

# Share of assessment rows whose predicted class matches the recorded species.
df_pred %>%
  accuracy(truth = species, estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.930
# Cross-tabulate the predicted class against the true species.
conf_mat(df_pred, truth = species, estimate = .pred_class)
##            Truth
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        36         1      1
##   Chinstrap      0        15      1
##   Gentoo         2         1     29

Better Preprocessing

# Recipe with imputation for both predictor types: numeric predictors get the
# column mean, nominal predictors get the most frequent level.
# The chain ends at the last step; a trailing %>% would run into the following
# text and fail to parse.
rcp <-
  recipe(species ~ ., data = df_analysis) %>%
  step_impute_mean(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors())
A Different Model

# Swap the decision tree for a random forest.
# "ranger" is the default engine for rand_forest(); it is named explicitly here.
# NOTE: the ranger package must be installed for this engine; if fitting fails
# asking for it, run install.packages("ranger").
classifier <-
  rand_forest(mode = "classification") %>%
  set_engine("ranger")

# Rebuild the workflow with the same recipe and the new model specification.
wf <-
  workflow() %>%
  add_recipe(rcp) %>%
  add_model(classifier)

# Train the recipe and the random forest on the analysis set.
model <- fit(wf, data = df_analysis)
# predict() returns only the .pred_class column, so bind the assessment data
# back on to make the true species available to the metric functions below.
df_pred <-
  predict(model, new_data = df_assess) %>%
  bind_cols(df_assess)

# Share of assessment rows whose predicted class matches the recorded species.
df_pred %>%
  accuracy(truth = species, estimate = .pred_class)
## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.988
# Cross-tabulate the predicted class against the true species.
conf_mat(df_pred, truth = species, estimate = .pred_class)
##            Truth
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        38         0      1
##   Chinstrap      0        17      0
##   Gentoo         0         0     30