Classification of Penguins using tidymodels

What you'll learn

how to install the tidymodels package and the palmerpenguins dataset
how to plot and investigate the data
how to set up a tidymodels workflow, recipe and model
how to evaluate and tune your model

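For this codelab, you will need to install the tidymodels and palmerpenguins packages. The later sections also use the themis package (for upsampling) and the ranger package (the random forest engine).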

The tidymodels meta-package is actually a collection of machine learning packages that are designed to work well together.

The palmerpenguins package contains a dataset of observed penguin species and their various physical characteristics. The aim of this codelab is to build a model to predict a penguin's species based on these recorded physical characteristics.

To install the required packages, run the following code in the R console.

# tidymodels and palmerpenguins are attached with library() further down.
# themis is called as themis::step_upsample() and ranger is the engine used by
# the random forest model, so both have to be installed as well.
install.packages(c("tidymodels", "palmerpenguins", "themis", "ranger"))

Hello

# Attach the tidymodels meta-package. Use TRUE rather than T: T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
library(tidymodels, quietly = TRUE)

## Registered S3 method overwritten by 'tune':
##   method                   from   
##   required_pkgs.model_spec parsnip

## ── Attaching packages ────────────────────────────────────── tidymodels 0.1.3 ──

## ✓ broom        0.7.8      ✓ recipes      0.1.16
## ✓ dials        0.0.9      ✓ rsample      0.1.0 
## ✓ dplyr        1.0.7      ✓ tibble       3.1.2 
## ✓ ggplot2      3.3.5      ✓ tidyr        1.1.3 
## ✓ infer        0.5.4      ✓ tune         0.1.5 
## ✓ modeldata    0.1.0      ✓ workflows    0.2.2 
## ✓ parsnip      0.1.6      ✓ workflowsets 0.0.2 
## ✓ purrr        0.3.4      ✓ yardstick    0.0.8

## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.

# Attach the package that provides the `penguins` data frame. Use TRUE rather
# than T: T is an ordinary variable that can be reassigned.
library(palmerpenguins, quietly = TRUE)

# initial_split() partitions the rows at random. Fix the RNG seed first so the
# split (and the metrics computed from it) can be reproduced from run to run.
set.seed(123)

# Split the penguins data into an analysis and an assessment partition,
# stratified by species.
df_split <- initial_split(penguins, strata = species)

print(df_split)

## <Analysis/Assess/Total>
## <258/86/344>

# Pull the two partitions out of the split object: one to fit on, one to
# evaluate on.
df_analysis <- analysis(df_split)
df_assess <- assessment(df_split)

# Count the missing values in every column of the full dataset.
summarise(
  penguins,
  across(everything(), function(col) sum(is.na(col)))
)

## # A tibble: 1 x 8
##   species island bill_length_mm bill_depth_mm flipper_length_... body_mass_g   sex
##     <int>  <int>          <int>         <int>            <int>       <int> <int>
## 1       0      0              2             2                2           2    11
## # ... with 1 more variable: year <int>

# Bare recipe: species is the outcome, every other column is a predictor.
rcp <- recipe(species ~ ., data = df_analysis)

# Recipe with imputation, built one step at a time:
# numeric predictors get their mean, nominal predictors get their mode.
rcp <- recipe(species ~ ., data = df_analysis)
rcp <- step_impute_mean(rcp, all_numeric_predictors())
rcp <- step_impute_mode(rcp, all_nominal_predictors())

# Estimate the recipe, then apply it to the analysis data to preview the
# preprocessed result.
bake(prep(rcp), new_data = df_analysis)

## # A tibble: 258 x 8
##    island  bill_length_mm bill_depth_mm flipper_length_... body_mass_g sex    year
##    <fct>            <dbl>         <dbl>            <int>       <int> <fct> <int>
##  1 Torger...           39.1          18.7              181        3750 male   2007
##  2 Torger...           40.3          18                195        3250 fema...  2007
##  3 Torger...           44.0          17.2              201        4212 male   2007
##  4 Torger...           39.3          20.6              190        3650 male   2007
##  5 Torger...           38.9          17.8              181        3625 fema...  2007
##  6 Torger...           39.2          19.6              195        4675 male   2007
##  7 Torger...           34.1          18.1              193        3475 male   2007
##  8 Torger...           42            20.2              190        4250 male   2007
##  9 Torger...           37.8          17.1              186        3300 male   2007
## 10 Torger...           37.8          17.3              180        3700 male   2007
## # ... with 248 more rows, and 1 more variable: species <fct>

# Model specification: a classification decision tree fitted with rpart.
classifier <- set_engine(decision_tree(mode = "classification"), "rpart")

# Bundle the preprocessing recipe and the model specification into a workflow.
wf <- workflow()
wf <- add_recipe(wf, rcp)
wf <- add_model(wf, classifier)

# Fit the whole workflow (recipe + model) on the analysis partition.
model <- wf %>% fit(data = df_analysis)

model

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_impute_mean()
## • step_impute_mode()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## n= 258 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 258 144 Adelie (0.441860465 0.197674419 0.360465116)  
##   2) flipper_length_mm< 207.5 163  49 Adelie (0.699386503 0.294478528 0.006134969)  
##     4) bill_length_mm< 44.65 116   5 Adelie (0.956896552 0.043103448 0.000000000) *
##     5) bill_length_mm>=44.65 47   4 Chinstrap (0.063829787 0.914893617 0.021276596) *
##   3) flipper_length_mm>=207.5 95   3 Gentoo (0.000000000 0.031578947 0.968421053) *

# Append the predicted class for each held-out row to the assessment data.
df_pred <- df_assess %>% bind_cols(predict(model, new_data = df_assess))

# Accuracy of the predicted class against the true species.
accuracy(df_pred, truth = species, estimate = .pred_class)

## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.930

# Confusion matrix of predicted class against true species.
conf_mat(df_pred, truth = species, estimate = .pred_class)

##            Truth
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        36         1      1
##   Chinstrap      0        15      1
##   Gentoo         2         1     29

Better Preprocessing

# Same imputation steps as before, followed by themis::step_upsample() on the
# species outcome.
rcp <- recipe(species ~ ., data = df_analysis)
rcp <- step_impute_mean(rcp, all_numeric_predictors())
rcp <- step_impute_mode(rcp, all_nominal_predictors())
rcp <- themis::step_upsample(rcp, species)

## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes

A Different Model

# Random forest classifier. Name the engine explicitly, as the decision tree
# specification does with "rpart", instead of relying on the default, which
# produces an "Engine set to `ranger`" warning when the model is fitted.
classifier <-
  rand_forest(mode = "classification") %>%
  set_engine("ranger")

# Rebuild the workflow from the current recipe and model specification.
wf <- workflow()
wf <- add_recipe(wf, rcp)
wf <- add_model(wf, classifier)

# Refit on the analysis partition with the updated workflow.
model <- wf %>% fit(data = df_analysis)

## Warning: Engine set to `ranger`.

Assess

# Append the new model's predicted class to the assessment data.
df_pred <- df_assess %>% bind_cols(predict(model, new_data = df_assess))

# Accuracy of the predicted class against the true species.
accuracy(df_pred, truth = species, estimate = .pred_class)

## # A tibble: 1 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy multiclass     0.988

# Confusion matrix of predicted class against true species.
conf_mat(df_pred, truth = species, estimate = .pred_class)

##            Truth
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        38         0      1
##   Chinstrap      0        17      0
##   Gentoo         0         0     30