First of all, thank you for using healthyR.ai
. If you
encounter issues or want to make a feature request, please visit https://github.com/spsanderson/healthyR.ai/issues
library(healthyR.ai)
In this should example we will showcase the
pca_your_recipe()
function. This function takes only a few
arguments. The arguments are currently .data
which is the
full data set that gets passed internally to the
recipes::bake()
function, .recipe_object
which
is a recipe you have already made and want to pass to the function in
order to perform the pca, and finally .threshold
which is
the fraction of the variance that should be captured by the
components.
To start this walk through we will first load in a few libraries.
library(timetk)
library(dplyr)
library(purrr)
library(healthyR.data)
library(rsample)
library(recipes)
library(ggplot2)
library(plotly)
Now that we have out libraries we can go ahead and get our data set ready.
<- healthyR_data %>%
data_tbl select(visit_end_date_time) %>%
summarise_by_time(
.date_var = visit_end_date_time,
.by = "month",
value = n()
%>%
) set_names("date_col","value") %>%
filter_by_time(
.date_var = date_col,
.start_date = "2013",
.end_date = "2020"
%>%
) mutate(date_col = as.Date(date_col))
head(data_tbl)
#> # A tibble: 6 × 2
#> date_col value
#> <date> <int>
#> 1 2013-01-01 2082
#> 2 2013-02-01 1719
#> 3 2013-03-01 1796
#> 4 2013-04-01 1865
#> 5 2013-05-01 2028
#> 6 2013-06-01 1813
The data set is simple and by itself would not be at all useful for a
pca analysis since there is only one predictor, being time. In order to
facilitate the use of the function and this example, we will create a
splits
object and a recipe
object.
<- initial_split(data = data_tbl, prop = 0.8)
splits
splits#> <Training/Testing/Total>
#> <76/19/95>
head(training(splits))
#> # A tibble: 6 × 2
#> date_col value
#> <date> <int>
#> 1 2017-06-01 1661
#> 2 2017-10-01 1614
#> 3 2019-04-01 1443
#> 4 2018-03-01 1618
#> 5 2013-11-01 1669
#> 6 2019-11-01 1302
<- recipe(value ~ ., training(splits)) %>%
rec_obj step_timeseries_signature(date_col) %>%
step_rm(matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)"))
rec_obj#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 1
#>
#> ── Operations
#> • Timeseries signature features from: date_col
#> • Variables removed: matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)")
get_juiced_data(rec_obj) %>% glimpse()
#> Rows: 76
#> Columns: 20
#> $ date_col <date> 2017-06-01, 2017-10-01, 2019-04-01, 2018-03-01, 20…
#> $ value <int> 1661, 1614, 1443, 1618, 1669, 1302, 1782, 997, 1544…
#> $ date_col_index.num <dbl> 1496275200, 1506816000, 1554076800, 1519862400, 138…
#> $ date_col_year <int> 2017, 2017, 2019, 2018, 2013, 2019, 2014, 2020, 201…
#> $ date_col_half <int> 1, 2, 1, 1, 2, 2, 2, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, …
#> $ date_col_quarter <int> 2, 4, 2, 1, 4, 4, 3, 2, 2, 3, 1, 3, 1, 2, 2, 2, 2, …
#> $ date_col_month <int> 6, 10, 4, 3, 11, 11, 7, 6, 5, 9, 2, 8, 2, 5, 6, 4, …
#> $ date_col_month.lbl <ord> June, October, April, March, November, November, Ju…
#> $ date_col_day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ date_col_wday <int> 5, 1, 2, 5, 6, 6, 3, 2, 6, 3, 1, 7, 4, 6, 1, 3, 1, …
#> $ date_col_wday.lbl <ord> Thursday, Sunday, Monday, Thursday, Friday, Friday,…
#> $ date_col_mday <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
#> $ date_col_qday <int> 62, 1, 1, 60, 32, 32, 1, 62, 31, 63, 32, 32, 32, 31…
#> $ date_col_yday <int> 152, 274, 91, 60, 305, 305, 182, 153, 121, 244, 32,…
#> $ date_col_mweek <int> 5, 5, 6, 5, 5, 5, 5, 6, 5, 6, 5, 5, 5, 5, 5, 6, 5, …
#> $ date_col_week <int> 22, 40, 13, 9, 44, 44, 26, 22, 18, 35, 5, 31, 5, 18…
#> $ date_col_week2 <int> 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, …
#> $ date_col_week3 <int> 1, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 1, 2, 0, 1, 1, 0, …
#> $ date_col_week4 <int> 2, 0, 1, 1, 0, 0, 2, 2, 2, 3, 1, 3, 1, 2, 2, 1, 2, …
#> $ date_col_mday7 <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
Now that we have out initial recipe we can use the
pca_your_recipe()
function.
<- pca_your_recipe(
pca_list .recipe_object = rec_obj,
.data = data_tbl,
.threshold = 0.8,
.top_n = 5
)#> Warning: Column(s) have zero variance so scaling cannot be used:
#> `date_col_day`, `date_col_mday` and `date_col_mday7`. Consider using
#> `step_zv()` to remove those columns before normalizing
The function returns a list object and does so
insvisible
so you must assign the output to a variable, you
can then access the items of the list in the usual manner.
The following items are included in the output of the function:
Lets start going down the list of items.
This is the portion you will want to output to a variable as this is the recipe object itself that you will use further down the line of your work.
<- pca_list$pca_transform
pca_rec_obj
pca_rec_obj#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 1
#>
#> ── Operations
#> • Timeseries signature features from: date_col
#> • Variables removed: matches("(iso$)|(xts$)|(hour)|(min)|(sec)|(am.pm)")
#> • Centering for: recipes::all_numeric()
#> • Scaling for: recipes::all_numeric()
#> • Sparse, unbalanced variable filter on: recipes::all_numeric()
#> • PCA extraction with: recipes::all_numeric_predictors()
$variable_loadings
pca_list#> # A tibble: 169 × 4
#> terms value component id
#> <chr> <dbl> <chr> <chr>
#> 1 date_col_index.num -0.0702 PC1 pca_UUIWi
#> 2 date_col_year -0.0156 PC1 pca_UUIWi
#> 3 date_col_half -0.387 PC1 pca_UUIWi
#> 4 date_col_quarter -0.432 PC1 pca_UUIWi
#> 5 date_col_month -0.433 PC1 pca_UUIWi
#> 6 date_col_wday 0.0539 PC1 pca_UUIWi
#> 7 date_col_qday -0.0363 PC1 pca_UUIWi
#> 8 date_col_yday -0.433 PC1 pca_UUIWi
#> 9 date_col_mweek -0.0223 PC1 pca_UUIWi
#> 10 date_col_week -0.433 PC1 pca_UUIWi
#> # ℹ 159 more rows
$variable_variance
pca_list#> # A tibble: 52 × 4
#> terms value component id
#> <chr> <dbl> <int> <chr>
#> 1 variance 5.22 1 pca_UUIWi
#> 2 variance 2.01 2 pca_UUIWi
#> 3 variance 1.46 3 pca_UUIWi
#> 4 variance 1.25 4 pca_UUIWi
#> 5 variance 1.22 5 pca_UUIWi
#> 6 variance 0.630 6 pca_UUIWi
#> 7 variance 0.621 7 pca_UUIWi
#> 8 variance 0.543 8 pca_UUIWi
#> 9 variance 0.0519 9 pca_UUIWi
#> 10 variance 0.000199 10 pca_UUIWi
#> # ℹ 42 more rows
$pca_estimates
pca_list#>
#> ── Recipe ──────────────────────────────────────────────────────────────────────
#>
#> ── Inputs
#> Number of variables by role
#> outcome: 1
#> predictor: 1
#>
#> ── Training information
#> Training data contained 76 data points and no incomplete rows.
#>
#> ── Operations
#> • Timeseries signature features from: date_col | Trained
#> • Variables removed: date_col_year.iso, date_col_month.xts, ... | Trained
#> • Centering for: value, date_col_index.num, date_col_year, ... | Trained
#> • Scaling for: value, date_col_index.num, date_col_year, ... | Trained
#> • Sparse, unbalanced variable filter removed: date_col_day, ... | Trained
#> • PCA extraction with: date_col_index.num, date_col_year, ... | Trained
$pca_juiced_estimates %>% glimpse()
pca_list#> Rows: 76
#> Columns: 9
#> $ date_col <date> 2017-06-01, 2017-10-01, 2019-04-01, 2018-03-01, 20…
#> $ value <dbl> 0.37487163, 0.20817905, -0.39829819, 0.22236566, 0.…
#> $ date_col_month.lbl <ord> June, October, April, March, November, November, Ju…
#> $ date_col_wday.lbl <ord> Thursday, Sunday, Monday, Thursday, Friday, Friday,…
#> $ PC1 <dbl> 0.5263510, -2.7142102, 1.6098165, 2.4795466, -2.886…
#> $ PC2 <dbl> -0.35962071, -0.64910605, -1.75091951, -0.92153364,…
#> $ PC3 <dbl> -1.06456824, 0.18441200, 1.69207174, -1.68959979, -…
#> $ PC4 <dbl> -0.7367524, 1.3257431, 0.2336602, -0.8366235, 1.281…
#> $ PC5 <dbl> 0.03831523, -0.88288705, -1.31822361, -0.40607844, …
$pca_baked_data %>% glimpse()
pca_list#> Rows: 95
#> Columns: 9
#> $ date_col <date> 2013-01-01, 2013-02-01, 2013-03-01, 2013-04-01, 20…
#> $ value <dbl> 1.8680115, 0.5805774, 0.8536695, 1.0983883, 1.67649…
#> $ date_col_month.lbl <ord> January, February, March, April, May, June, July, A…
#> $ date_col_wday.lbl <ord> Tuesday, Friday, Friday, Monday, Wednesday, Saturda…
#> $ PC1 <dbl> 3.3082608, 2.9870082, 2.6909257, 1.8307756, 1.10203…
#> $ PC2 <dbl> 1.7972511, 1.8684487, 2.0782693, 1.8329967, 1.91010…
#> $ PC3 <dbl> 1.66057043, 0.08783168, -1.80843259, 1.73603741, -1…
#> $ PC4 <dbl> 1.10245427, 0.74028568, -0.24999253, 0.85737503, 0.…
#> $ PC5 <dbl> -1.17217788, 1.00619323, -0.16188733, -1.43608068, …
$pca_rotation_df %>% glimpse()
pca_list#> Rows: 13
#> Columns: 13
#> $ PC1 <dbl> -0.07018084, -0.01555417, -0.38693079, -0.43206932, -0.43313695, …
#> $ PC2 <dbl> -0.68875978, -0.69882773, 0.05974677, 0.01848655, 0.04374089, 0.0…
#> $ PC3 <dbl> -0.011315455, -0.005726013, 0.160939270, 0.040132379, -0.04602952…
#> $ PC4 <dbl> -0.12611003, -0.11541603, -0.08328288, 0.06326173, -0.09129617, 0…
#> $ PC5 <dbl> 0.023359774, 0.022275802, 0.227546691, 0.059153195, 0.008936985, …
#> $ PC6 <dbl> -0.016188471, -0.017654860, -0.084496904, -0.102971157, 0.0061802…
#> $ PC7 <dbl> 0.028183663, 0.024210177, -0.142379182, -0.030594114, 0.031473627…
#> $ PC8 <dbl> 0.02650395, 0.02274382, 0.30839416, 0.06635854, 0.02882196, -0.03…
#> $ PC9 <dbl> -0.00668247, 0.02093441, 0.79967481, -0.26996726, -0.21347946, 0.…
#> $ PC10 <dbl> -0.012401925, 0.011254839, 0.009578343, -0.305855962, -0.38192437…
#> $ PC11 <dbl> -0.0276210453, 0.0280170644, -0.0011925262, -0.0659160758, 0.6702…
#> $ PC12 <dbl> 2.756323e-03, -2.253457e-03, 2.898344e-03, 7.849322e-01, -3.96810…
#> $ PC13 <dbl> -7.080814e-01, 7.034045e-01, 1.528174e-04, 3.203706e-02, 8.787572…
$pca_variance_df %>% glimpse()
pca_list#> Rows: 13
#> Columns: 6
#> $ PC <chr> "PC1", "PC2", "PC3", "PC4", "PC5", "PC6", "PC7", "PC8"…
#> $ var_explained <dbl> 4.014345e-01, 1.544499e-01, 1.119697e-01, 9.624221e-02…
#> $ var_pct_txt <chr> "40.14%", "15.44%", "11.20%", "9.62%", "9.39%", "4.84%…
#> $ cum_var_pct <dbl> 0.4014345, 0.5558844, 0.6678540, 0.7640962, 0.8579657,…
#> $ cum_var_pct_txt <chr> "40.14%", "55.59%", "66.79%", "76.41%", "85.80%", "90.…
#> $ ou_threshold <fct> Under, Under, Under, Under, Over, Over, Over, Over, Ov…
$pca_variance_scree_plt pca_list
$pca_loadings_plt pca_list
$pca_top_n_loadings_plt pca_list