(Thanks Kat!)
tidyverse
syntaxMaybe…
If you’re using machine learning/non-parametric models
If the purpose of your model is prediction
If you’re working with a large dataset
Any time you’re fitting the same model over and over again
Any time you’re conducting the same data wrangling process over and over again
library(tidymodels)
library(stringr)
The dataset we’ll be working with is found at http://insideairbnb.com/get-the-data.html.
temp = tempfile()
download.file("http://data.insideairbnb.com/united-states/ny/new-york-city/2019-06-02/data/listings.csv.gz", temp)
airbnb_data = gzfile(temp, 'rt')%>%
read.csv()%>%
as_tibble()
nrow(airbnb_data)
## [1] 48801
head(airbnb_data)
rec = recipe(price ~ host_since +
host_response_time +
host_response_rate +
host_is_superhost +
host_listings_count +
host_has_profile_pic +
neighbourhood_group_cleansed +
zipcode +
room_type +
accommodates +
bathrooms +
bedrooms +
beds +
bed_type +
square_feet +
security_deposit +
cleaning_fee +
guests_included +
extra_people +
maximum_nights +
minimum_nights +
availability_365 +
number_of_reviews +
last_review +
review_scores_rating +
review_scores_accuracy +
review_scores_cleanliness +
review_scores_checkin +
review_scores_communication +
review_scores_location +
review_scores_value +
instant_bookable +
is_business_travel_ready +
cancellation_policy +
require_guest_profile_picture +
require_guest_phone_verification +
calculated_host_listings_count +
reviews_per_month,
data = airbnb_data)
rec
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
- `var_info`
- `term_info`
- `steps`
- `template`
- `levels`
- `retained`
rec$var_info
rec$template
rec = rec %>%
step_filter(neighbourhood_group_cleansed != "Staten Island")%>%
step_mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
cleaning_fee = as.character(cleaning_fee)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
security_deposit = as.character(security_deposit)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
extra_people = as.character(extra_people)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
host_since = (Sys.Date()-as.Date(host_since))%>%
as.numeric(),
host_response_rate = as.character(host_response_rate)%>%
stringr::str_remove("%")%>%
as.numeric(),
host_response_time = ifelse(host_response_time == "N/A", NA, host_response_time))
rec
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
##
## Operations:
##
## Row filtering
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time
retain = TRUE
overwrites the previous templaterec = prep(rec, retain = TRUE)
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
cleaned_airbnb_data = bake(rec, new_data = airbnb_data)
cleaned_airbnb_data
rec1 = rec%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(host_since,
host_response_time,
host_is_superhost,
host_listings_count,
host_has_profile_pic),
options = list(nbagg = 5, keepX = FALSE))
rec1
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
##
## Training data contained 48801 data points and 48439 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate
prep(rec1)$template
step_knnimput()
step_lowerimpute()
step_meanimpute()
step_medianimpute()
step_modeimpute()
step_rollimpute()
rec2 = rec%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(starts_with("host")))
has_role()
has_type()
all_numeric()
all_predictors()
all_outcomes()
everything()
ends_with()
contains()
matches()
num_range()
one_of()
rec3 = rec%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(starts_with("host") - all_nominal()))
rec1
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
##
## Training data contained 48801 data points and 48439 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate
rec1 = rec1 %>%
add_role(neighbourhood_group_cleansed, new_role = "strata")
rec1
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
## strata 1
##
## Training data contained 48801 data points and 48439 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate
rec1 = rec1 %>%
step_meanimpute(starts_with("review"))%>%
step_center(starts_with("review"))%>%
step_scale(starts_with("review"))%>%
step_pca(starts_with("review"), threshold = 0.95)%>%
prep()
rec1$term_info
step_naomit()
rec1 = rec1%>%
step_naomit(all_predictors())
step_BoxCox()
rec1 = rec1%>%
step_BoxCox(price)
step_corr()
rec1 = rec1%>%
step_corr(all_numeric(), threshold = 0.5)
prep(rec1)
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
## strata 1
##
## Training data contained 48445 data points and 48232 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate [trained]
## Mean Imputation for 8 items [trained]
## Centering for 8 items [trained]
## Scaling for 8 items [trained]
## PCA extraction with 8 items [trained]
## Removing rows with NA values in all_predictors()
## Box-Cox transformation on price [trained]
## Correlation filter removed bedrooms, ... [trained]
add_step()
For for an example walking through how to create your own step, see https://tidymodels.github.io/recipes/articles/Custom_Steps.html
cleaned_airbnb_data = recipe(price ~ host_since +
host_response_time +
host_response_rate +
host_is_superhost +
host_listings_count +
host_has_profile_pic +
neighbourhood_group_cleansed +
zipcode +
room_type +
accommodates +
bathrooms +
bedrooms +
beds +
bed_type +
square_feet +
security_deposit +
cleaning_fee +
guests_included +
extra_people +
maximum_nights +
minimum_nights +
availability_365 +
number_of_reviews +
last_review +
review_scores_rating +
review_scores_accuracy +
review_scores_cleanliness +
review_scores_checkin +
review_scores_communication +
review_scores_location +
review_scores_value +
instant_bookable +
is_business_travel_ready +
cancellation_policy +
require_guest_profile_picture +
require_guest_phone_verification +
calculated_host_listings_count +
reviews_per_month,
data = airbnb_data)%>%
step_filter(neighbourhood_group_cleansed != "Staten Island")%>%
step_mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
cleaning_fee = as.character(cleaning_fee)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
security_deposit = as.character(security_deposit)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
extra_people = as.character(extra_people)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
host_since = (Sys.Date()-as.Date(host_since))%>%
as.numeric(),
host_response_rate = as.character(host_response_rate)%>%
stringr::str_remove("%")%>%
as.numeric(),
host_response_time = ifelse(host_response_time == "N/A", NA, host_response_time))%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(host_since,
host_response_time,
host_is_superhost,
host_listings_count,
host_has_profile_pic),
options = list(nbagg = 5, keepX = FALSE))%>%
add_role(neighbourhood_group_cleansed, new_role = "strata")%>%
step_meanimpute(starts_with("review"))%>%
step_center(starts_with("review"))%>%
step_scale(starts_with("review"))%>%
step_pca(starts_with("review"), threshold = 0.95)%>%
step_naomit(everything())%>%
step_corr(all_numeric(), threshold = 0.5)%>%
step_mutate(price_greater_than_100 = factor(ifelse(price > 100, 1, 0)))%>%
prep()%>%
bake(new_data = airbnb_data)
cleaned_airbnb_data
logistic_regression = logistic_reg()%>% #define the model you're going to fit
set_engine("glm")%>% #choose the package that you want to use to fit the model
fit(price_greater_than_100 ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1, data = cleaned_airbnb_data) #make a call to the data
linear_regression = linear_reg()%>%
set_engine("lm")%>%
fit_xy(x = cleaned_airbnb_data %>%
select(host_is_superhost,
host_response_rate,
room_type,
host_listings_count,
number_of_reviews,
PC1),
y = cleaned_airbnb_data$price)
tidy(linear_regression, conf.int = TRUE)
recipes()
and ggplot()
- you can start specifying a model outline (“recipe”) before you make any calls to a dataset.logistic_regression = logistic_reg()%>%
set_engine("glm")
logistic_regression
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
translate()
function lets you view the syntax of the underlying modeling function.logistic_regression%>%
translate()
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
##
## Model fit template:
## stats::glm(formula = missing_arg(), data = missing_arg(), weights = missing_arg(),
## family = stats::binomial)
parsnip
glmnet
keras
lm
spark
stan
glm
glmnet
keras
lm
spark
stan
earth
glmnet
keras
spark
flexsurv
survreg
C5.0
spark
xgboost
randomForest
ranger
spark
C5.0
spark
rpart
keras
nnet
kknn
kernlab
parsnip
is only implemented for parametric time-to-event models (Weibull, exponential, etc.)parsnip
:(airbnb_data = airbnb_data %>%
mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric())
nrow(airbnb_data)
## [1] 48801
predict_with_lm = lm(price %>% as.character()%>%
stringr::str_remove("\\$")%>%
as.numeric() ~ review_scores_accuracy+
review_scores_cleanliness+
review_scores_checkin+
review_scores_communication+
review_scores_location+
review_scores_value,
data = airbnb_data)
length(predict(predict_with_lm))
## [1] 37489
parsnip
:)predict_with_parsnip = linear_reg()%>%
set_engine("lm")%>%
fit(price ~ review_scores_accuracy+
review_scores_cleanliness+
review_scores_checkin+
review_scores_communication+
review_scores_location+
review_scores_value,
data = airbnb_data)%>%
predict(airbnb_data)
predict_with_parsnip
parsnip
when there are multiple predictions at each observationpenalized_regression = linear_reg()%>%
set_engine("glmnet")%>%
fit(price ~ review_scores_accuracy+
review_scores_cleanliness+
review_scores_checkin+
review_scores_communication+
review_scores_location+
review_scores_value, airbnb_data)
predicted_values = penalized_regression%>%
multi_predict(airbnb_data)
predicted_values
predicted_values$.pred[2]
## [[1]]
## # A tibble: 64 x 2
## penalty .pred
## <dbl> <dbl>
## 1 0.0416 149.
## 2 0.0457 149.
## 3 0.0502 149.
## 4 0.0550 149.
## 5 0.0604 149.
## 6 0.0663 149.
## 7 0.0728 149.
## 8 0.0799 149.
## 9 0.0876 149.
## 10 0.0962 149.
## # ... with 54 more rows
unnested = predicted_values%>%
unnest()
unnested
nrow(unnested)
## [1] 3123264
parsnip
and recipes
together to conduct a stratified analysisnew_recipe = recipe(price ~ number_of_reviews +
review_scores_rating +
review_scores_accuracy +
review_scores_cleanliness +
review_scores_checkin +
review_scores_communication +
review_scores_location +
review_scores_value+
neighbourhood_group_cleansed,
data = airbnb_data)%>%
add_role(neighbourhood_group_cleansed, new_role = "strata")%>%
step_mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric())%>%
prep()
get_linear_reg = function( recipe, data, strata, ...){
prepped_data = recipe%>%
step_filter(neighbourhood_group_cleansed == strata, role = "strata")%>%
prep()
linear_regression = linear_reg()%>%
set_engine("lm")%>%
fit_xy(x = bake(prepped_data, data, all_predictors(), - has_role("strata")),
y = bake(prepped_data, data, all_outcomes()))
}
get_linear_reg(new_recipe, airbnb_data, "Manhattan")%>%
tidy(conf.int = T)
recipes
and parsnip
for a nested stratified analysisget_linear_reg = function(data, recipe, ...){
linear_regression = linear_reg()%>%
set_engine("lm")%>%
fit_xy(x = bake(recipe, data, all_predictors(), - has_role("strata")),
y = bake(recipe, data, all_outcomes()))
}
Nest the data
strat_analysis = airbnb_data%>%
group_by(neighbourhood_group_cleansed)%>%
nest()
strat_analysis
strat_analysis = airbnb_data%>%
group_by(neighbourhood_group_cleansed)%>%
nest()%>%
mutate(stratified_models = map(data, get_linear_reg, new_recipe, airbnb_data))
strat_analysis
strat_analysis$stratified_models[[1]]%>%
tidy(conf.int = T)
set.seed(2342)
airbnb_bootstraps = bootstraps(cleaned_airbnb_data, times = 10, strata = NULL)
airbnb_bootstraps
airbnb_bootstraps$splits
## [[1]]
## <213/82/213>
##
## [[2]]
## <213/76/213>
##
## [[3]]
## <213/84/213>
##
## [[4]]
## <213/73/213>
##
## [[5]]
## <213/79/213>
##
## [[6]]
## <213/80/213>
##
## [[7]]
## <213/83/213>
##
## [[8]]
## <213/79/213>
##
## [[9]]
## <213/76/213>
##
## [[10]]
## <213/82/213>
rsplit
objectclass(airbnb_bootstraps$splits[[1]])
## [1] "rsplit" "boot_split"
analysis()
gives the bootstraped set as a dataframeanalysis(airbnb_bootstraps$splits[[1]])
assessment()
gives the out-of-bag set as a dataframeassessment(airbnb_bootstraps$splits[[1]])
set.seed(346234)
strat_airbnb_boots = bootstraps(cleaned_airbnb_data, times = 100, strata = "neighbourhood_group_cleansed")
strat_airbnb_boots
set.seed(86869)
airbnb_folds = vfold_cv(cleaned_airbnb_data, times = 10)
airbnb_folds
rolling_origin()
mc_cv()
nested_cv()
rsample2caret()
caret2rsample()
as.data.frame.rsplit()
fit_model()
as a wrapper for lm()
fit_model = function(data, formula){
lm(formula, data)%>%
tidy()
}
fit_model(cleaned_airbnb_data,
formula(price ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1))
fit_model()
onto bootstrap setsboot_cis = strat_airbnb_boots %>%
mutate(fitted_models = map(splits, fit_model,
formula = formula(price ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1)))
boot_cis
boot_cis = boot_cis %>%
select(-c(splits, id))%>%
unnest()
boot_cis
boot_cis_private_room = boot_cis%>%
filter(term == "room_typePrivate room")
quantile(boot_cis_private_room$estimate, probs = c(0.025, 0.975))
## 2.5% 97.5%
## -136.41397 -88.85459
lm(price ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1,
data = cleaned_airbnb_data)%>%
tidy(conf.int = T)%>%
select(term, conf.low, conf.high)