This page holds the details for the recipe preprocessing blueprint. This
is the blueprint used by default from mold() if x is a recipe.
default_recipe_blueprint(
  intercept = FALSE,
  allow_novel_levels = FALSE,
  fresh = TRUE,
  composition = "tibble"
)

# S3 method for recipe
mold(x, data, ..., blueprint = NULL)
Argument | Description |
---|---|
intercept | A logical. Should an intercept be included in the processed data? This information is used by the process function in the mold and forge function list. |
allow_novel_levels | A logical. Should novel factor levels be allowed at prediction time? This information is used by the clean function in the forge function list, and is passed on to scream(). |
fresh | Should already trained operations be re-trained when prep() is called? |
composition | Either "tibble", "matrix", or "dgCMatrix" for the format of the processed predictors. If "matrix" or "dgCMatrix" are chosen, all of the predictors must be numeric after the preprocessing method has been applied; otherwise an error is thrown. |
x | An unprepped recipe created from recipes::recipe(). |
data | A data frame or matrix containing the outcomes and predictors. |
... | Not used. |
blueprint | A preprocessing blueprint. If left as NULL, then a default_recipe_blueprint() is used. |
For default_recipe_blueprint(), a recipe blueprint.
When mold() is used with the default recipe blueprint:
- It calls recipes::prep() to prep the recipe.
- It calls recipes::juice() to extract the outcomes and predictors. These are returned as tibbles.
- If intercept = TRUE, it adds an intercept column to the predictors.
When forge() is used with the default recipe blueprint:
- It calls shrink() to trim new_data to only the required columns and coerce new_data to a tibble.
- It calls scream() to perform validation on the structure of the columns of new_data.
- It calls recipes::bake() on the new_data using the prepped recipe used during training.
- It adds an intercept column onto new_data if intercept = TRUE.
#>#> #>#>#> #>#>#> #>#> #>#>#> #># --------------------------------------------------------------------------- # Setup train <- iris[1:100,] test <- iris[101:150,] # --------------------------------------------------------------------------- # Recipes example # Create a recipe that logs a predictor rec <- recipe(Species ~ Sepal.Length + Sepal.Width, train) %>% step_log(Sepal.Length) processed <- mold(rec, train) # Sepal.Length has been logged processed$predictors#> # A tibble: 100 x 2 #> Sepal.Length Sepal.Width #> <dbl> <dbl> #> 1 1.63 3.5 #> 2 1.59 3 #> 3 1.55 3.2 #> 4 1.53 3.1 #> 5 1.61 3.6 #> 6 1.69 3.9 #> 7 1.53 3.4 #> 8 1.61 3.4 #> 9 1.48 2.9 #> 10 1.59 3.1 #> # … with 90 more rowsprocessed$outcomes#> # A tibble: 100 x 1 #> Species #> <fct> #> 1 setosa #> 2 setosa #> 3 setosa #> 4 setosa #> 5 setosa #> 6 setosa #> 7 setosa #> 8 setosa #> 9 setosa #> 10 setosa #> # … with 90 more rows# The underlying blueprint is a prepped recipe processed$blueprint$recipe#> Data Recipe #> #> Inputs: #> #> role #variables #> outcome 1 #> predictor 2 #> #> Training data contained 100 data points and no missing data. #> #> Operations: #> #> Log transformation on Sepal.Length [trained]# Call forge() with the blueprint and the test data # to have it preprocess the test data in the same way forge(test, processed$blueprint)#> $predictors #> # A tibble: 50 x 2 #> Sepal.Length Sepal.Width #> <dbl> <dbl> #> 1 1.84 3.3 #> 2 1.76 2.7 #> 3 1.96 3 #> 4 1.84 2.9 #> 5 1.87 3 #> 6 2.03 3 #> 7 1.59 2.5 #> 8 1.99 2.9 #> 9 1.90 2.5 #> 10 1.97 3.6 #> # … with 40 more rows #> #> $outcomes #> NULL #> #> $extras #> $extras$roles #> NULL #> #># Use `outcomes = TRUE` to also extract the preprocessed outcome! 
# This logged the Sepal.Length column of `new_data` forge(test, processed$blueprint, outcomes = TRUE)#> $predictors #> # A tibble: 50 x 2 #> Sepal.Length Sepal.Width #> <dbl> <dbl> #> 1 1.84 3.3 #> 2 1.76 2.7 #> 3 1.96 3 #> 4 1.84 2.9 #> 5 1.87 3 #> 6 2.03 3 #> 7 1.59 2.5 #> 8 1.99 2.9 #> 9 1.90 2.5 #> 10 1.97 3.6 #> # … with 40 more rows #> #> $outcomes #> # A tibble: 50 x 1 #> Species #> <fct> #> 1 virginica #> 2 virginica #> 3 virginica #> 4 virginica #> 5 virginica #> 6 virginica #> 7 virginica #> 8 virginica #> 9 virginica #> 10 virginica #> # … with 40 more rows #> #> $extras #> $extras$roles #> NULL #> #># --------------------------------------------------------------------------- # With an intercept # You can add an intercept with `intercept = TRUE` processed <- mold(rec, train, blueprint = default_recipe_blueprint(intercept = TRUE)) processed$predictors#> # A tibble: 100 x 3 #> `(Intercept)` Sepal.Length Sepal.Width #> <int> <dbl> <dbl> #> 1 1 1.63 3.5 #> 2 1 1.59 3 #> 3 1 1.55 3.2 #> 4 1 1.53 3.1 #> 5 1 1.61 3.6 #> 6 1 1.69 3.9 #> 7 1 1.53 3.4 #> 8 1 1.61 3.4 #> 9 1 1.48 2.9 #> 10 1 1.59 3.1 #> # … with 90 more rows# But you also could have used a recipe step rec2 <- step_intercept(rec) mold(rec2, iris)$predictors#> # A tibble: 150 x 3 #> intercept Sepal.Length Sepal.Width #> <dbl> <dbl> <dbl> #> 1 1 1.63 3.5 #> 2 1 1.59 3 #> 3 1 1.55 3.2 #> 4 1 1.53 3.1 #> 5 1 1.61 3.6 #> 6 1 1.69 3.9 #> 7 1 1.53 3.4 #> 8 1 1.61 3.4 #> 9 1 1.48 2.9 #> 10 1 1.59 3.1 #> # … with 140 more rows# --------------------------------------------------------------------------- # Non standard roles # If you have custom recipe roles, they are processed and returned in # the `$extras$roles` slot of the return value of `mold()` and `forge()`. 
rec_roles <- recipe(train) %>% update_role(Sepal.Width, new_role = "predictor") %>% update_role(Species, new_role = "outcome") %>% update_role(Sepal.Length, new_role = "custom_role") %>% update_role(Petal.Length, new_role = "custom_role2") processed_roles <- mold(rec_roles, train) processed_roles$extras#> $roles #> $roles$custom_role #> # A tibble: 100 x 1 #> Sepal.Length #> <dbl> #> 1 5.1 #> 2 4.9 #> 3 4.7 #> 4 4.6 #> 5 5 #> 6 5.4 #> 7 4.6 #> 8 5 #> 9 4.4 #> 10 4.9 #> # … with 90 more rows #> #> $roles$custom_role2 #> # A tibble: 100 x 1 #> Petal.Length #> <dbl> #> 1 1.4 #> 2 1.4 #> 3 1.3 #> 4 1.5 #> 5 1.4 #> 6 1.7 #> 7 1.4 #> 8 1.5 #> 9 1.4 #> 10 1.5 #> # … with 90 more rows #> #>#> $predictors #> # A tibble: 50 x 1 #> Sepal.Width #> <dbl> #> 1 3.3 #> 2 2.7 #> 3 3 #> 4 2.9 #> 5 3 #> 6 3 #> 7 2.5 #> 8 2.9 #> 9 2.5 #> 10 3.6 #> # … with 40 more rows #> #> $outcomes #> NULL #> #> $extras #> $extras$roles #> $extras$roles$custom_role #> # A tibble: 50 x 1 #> Sepal.Length #> <dbl> #> 1 6.3 #> 2 5.8 #> 3 7.1 #> 4 6.3 #> 5 6.5 #> 6 7.6 #> 7 4.9 #> 8 7.3 #> 9 6.7 #> 10 7.2 #> # … with 40 more rows #> #> $extras$roles$custom_role2 #> # A tibble: 50 x 1 #> Petal.Length #> <dbl> #> 1 6 #> 2 5.1 #> 3 5.9 #> 4 5.6 #> 5 5.8 #> 6 6.6 #> 7 4.5 #> 8 6.3 #> 9 5.8 #> 10 6.1 #> # … with 40 more rows #> #> #># --------------------------------------------------------------------------- # Matrix output for predictors # You can change the `composition` of the predictor data set bp <- default_recipe_blueprint(composition = "dgCMatrix") processed <- mold(rec, train, blueprint = bp) class(processed$predictors)#> [1] "dgCMatrix" #> attr(,"package") #> [1] "Matrix"