(Thanks Kat!)
tidyverse syntaxMaybe…
If you’re using machine learning/non-parametric models
If the purpose of your model is prediction
If you’re working with a large dataset
Any time you’re fitting the same model over and over again
Any time you’re conducting the same data wrangling process over and over again
library(tidymodels)
library(stringr)The dataset we’ll be working with is found at http://insideairbnb.com/get-the-data.html.
temp = tempfile()
download.file("http://data.insideairbnb.com/united-states/ny/new-york-city/2019-06-02/data/listings.csv.gz", temp)
airbnb_data = gzfile(temp, 'rt')%>%
read.csv()%>%
as_tibble()
nrow(airbnb_data)## [1] 48801
head(airbnb_data)rec = recipe(price ~ host_since +
host_response_time +
host_response_rate +
host_is_superhost +
host_listings_count +
host_has_profile_pic +
neighbourhood_group_cleansed +
zipcode +
room_type +
accommodates +
bathrooms +
bedrooms +
beds +
bed_type +
square_feet +
security_deposit +
cleaning_fee +
guests_included +
extra_people +
maximum_nights +
minimum_nights +
availability_365 +
number_of_reviews +
last_review +
review_scores_rating +
review_scores_accuracy +
review_scores_cleanliness +
review_scores_checkin +
review_scores_communication +
review_scores_location +
review_scores_value +
instant_bookable +
is_business_travel_ready +
cancellation_policy +
require_guest_profile_picture +
require_guest_phone_verification +
calculated_host_listings_count +
reviews_per_month,
data = airbnb_data)
rec## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
- `var_info`
- `term_info`
- `steps`
- `template`
- `levels`
- `retained`
rec$var_inforec$templaterec = rec %>%
step_filter(neighbourhood_group_cleansed != "Staten Island")%>%
step_mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
cleaning_fee = as.character(cleaning_fee)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
security_deposit = as.character(security_deposit)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
extra_people = as.character(extra_people)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
host_since = (Sys.Date()-as.Date(host_since))%>%
as.numeric(),
host_response_rate = as.character(host_response_rate)%>%
stringr::str_remove("%")%>%
as.numeric(),
host_response_time = ifelse(host_response_time == "N/A", NA, host_response_time))
rec## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
##
## Operations:
##
## Row filtering
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time
retain = TRUE overwrites the previous templaterec = prep(rec, retain = TRUE)## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
## Warning in function_list[[k]](value): NAs introduced by coercion
cleaned_airbnb_data = bake(rec, new_data = airbnb_data)
cleaned_airbnb_datarec1 = rec%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(host_since,
host_response_time,
host_is_superhost,
host_listings_count,
host_has_profile_pic),
options = list(nbagg = 5, keepX = FALSE))
rec1## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
##
## Training data contained 48801 data points and 48439 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate
prep(rec1)$templatestep_knnimput()step_lowerimpute()step_meanimpute()step_medianimpute()step_modeimpute()step_rollimpute()rec2 = rec%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(starts_with("host")))has_role()has_type()all_numeric()all_predictors()all_outcomes()everything()ends_with()contains()matches()num_range()one_of()rec3 = rec%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(starts_with("host") - all_nominal()))rec1## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
##
## Training data contained 48801 data points and 48439 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate
rec1 = rec1 %>%
add_role(neighbourhood_group_cleansed, new_role = "strata")
rec1 ## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
## strata 1
##
## Training data contained 48801 data points and 48439 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate
rec1 = rec1 %>%
step_meanimpute(starts_with("review"))%>%
step_center(starts_with("review"))%>%
step_scale(starts_with("review"))%>%
step_pca(starts_with("review"), threshold = 0.95)%>%
prep()
rec1$term_infostep_naomit()rec1 = rec1%>%
step_naomit(all_predictors())step_BoxCox()rec1 = rec1%>%
step_BoxCox(price)step_corr()rec1 = rec1%>%
step_corr(all_numeric(), threshold = 0.5)
prep(rec1)## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 38
## strata 1
##
## Training data contained 48445 data points and 48232 incomplete rows.
##
## Operations:
##
## Row filtering [trained]
## Variable mutation for price, cleaning_fee, security_deposit, extra_people, host_since, host_response_rate, host_response_time [trained]
## Bagged tree imputation for host_response_rate [trained]
## Mean Imputation for 8 items [trained]
## Centering for 8 items [trained]
## Scaling for 8 items [trained]
## PCA extraction with 8 items [trained]
## Removing rows with NA values in all_predictors()
## Box-Cox transformation on price [trained]
## Correlation filter removed bedrooms, ... [trained]
add_step()For for an example walking through how to create your own step, see https://tidymodels.github.io/recipes/articles/Custom_Steps.html
cleaned_airbnb_data = recipe(price ~ host_since +
host_response_time +
host_response_rate +
host_is_superhost +
host_listings_count +
host_has_profile_pic +
neighbourhood_group_cleansed +
zipcode +
room_type +
accommodates +
bathrooms +
bedrooms +
beds +
bed_type +
square_feet +
security_deposit +
cleaning_fee +
guests_included +
extra_people +
maximum_nights +
minimum_nights +
availability_365 +
number_of_reviews +
last_review +
review_scores_rating +
review_scores_accuracy +
review_scores_cleanliness +
review_scores_checkin +
review_scores_communication +
review_scores_location +
review_scores_value +
instant_bookable +
is_business_travel_ready +
cancellation_policy +
require_guest_profile_picture +
require_guest_phone_verification +
calculated_host_listings_count +
reviews_per_month,
data = airbnb_data)%>%
step_filter(neighbourhood_group_cleansed != "Staten Island")%>%
step_mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
cleaning_fee = as.character(cleaning_fee)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
security_deposit = as.character(security_deposit)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
extra_people = as.character(extra_people)%>%
stringr::str_remove("\\$")%>%
as.numeric(),
host_since = (Sys.Date()-as.Date(host_since))%>%
as.numeric(),
host_response_rate = as.character(host_response_rate)%>%
stringr::str_remove("%")%>%
as.numeric(),
host_response_time = ifelse(host_response_time == "N/A", NA, host_response_time))%>%
step_bagimpute(host_response_rate,
impute_with = imp_vars(host_since,
host_response_time,
host_is_superhost,
host_listings_count,
host_has_profile_pic),
options = list(nbagg = 5, keepX = FALSE))%>%
add_role(neighbourhood_group_cleansed, new_role = "strata")%>%
step_meanimpute(starts_with("review"))%>%
step_center(starts_with("review"))%>%
step_scale(starts_with("review"))%>%
step_pca(starts_with("review"), threshold = 0.95)%>%
step_naomit(everything())%>%
step_corr(all_numeric(), threshold = 0.5)%>%
step_mutate(price_greater_than_100 = factor(ifelse(price > 100, 1, 0)))%>%
prep()%>%
bake(new_data = airbnb_data)
cleaned_airbnb_datalogistic_regression = logistic_reg()%>% #define the model you're going to fit
set_engine("glm")%>% #choose the package that you want to use to fit the model
fit(price_greater_than_100 ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1, data = cleaned_airbnb_data) #make a call to the data
linear_regression = linear_reg()%>%
set_engine("lm")%>%
fit_xy(x = cleaned_airbnb_data %>%
select(host_is_superhost,
host_response_rate,
room_type,
host_listings_count,
number_of_reviews,
PC1),
y = cleaned_airbnb_data$price)
tidy(linear_regression, conf.int = TRUE)recipes() and ggplot() - you can start specifying a model outline (“recipe”) before you make any calls to a dataset.logistic_regression = logistic_reg()%>%
set_engine("glm")
logistic_regression## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
translate() function lets you view the syntax of the underlying modeling function.logistic_regression%>%
translate()## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
##
## Model fit template:
## stats::glm(formula = missing_arg(), data = missing_arg(), weights = missing_arg(),
## family = stats::binomial)
parsnipglmnet keras lm spark stanglm glmnet keras lm spark stanearthglmnet keras sparkflexsurv survregC5.0 spark xgboostrandomForest ranger sparkC5.0 spark rpartkeras nnetkknnkernlabparsnip is only implemented for parametric time-to-event models (Weibull, exponential, etc.)parsnip :(airbnb_data = airbnb_data %>%
mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric())
nrow(airbnb_data)## [1] 48801
predict_with_lm = lm(price %>% as.character()%>%
stringr::str_remove("\\$")%>%
as.numeric() ~ review_scores_accuracy+
review_scores_cleanliness+
review_scores_checkin+
review_scores_communication+
review_scores_location+
review_scores_value,
data = airbnb_data)
length(predict(predict_with_lm))## [1] 37489
parsnip :)predict_with_parsnip = linear_reg()%>%
set_engine("lm")%>%
fit(price ~ review_scores_accuracy+
review_scores_cleanliness+
review_scores_checkin+
review_scores_communication+
review_scores_location+
review_scores_value,
data = airbnb_data)%>%
predict(airbnb_data)
predict_with_parsnipparsnip when there are multiple predictions at each observationpenalized_regression = linear_reg()%>%
set_engine("glmnet")%>%
fit(price ~ review_scores_accuracy+
review_scores_cleanliness+
review_scores_checkin+
review_scores_communication+
review_scores_location+
review_scores_value, airbnb_data)
predicted_values = penalized_regression%>%
multi_predict(airbnb_data)
predicted_valuespredicted_values$.pred[2]## [[1]]
## # A tibble: 64 x 2
## penalty .pred
## <dbl> <dbl>
## 1 0.0416 149.
## 2 0.0457 149.
## 3 0.0502 149.
## 4 0.0550 149.
## 5 0.0604 149.
## 6 0.0663 149.
## 7 0.0728 149.
## 8 0.0799 149.
## 9 0.0876 149.
## 10 0.0962 149.
## # ... with 54 more rows
unnested = predicted_values%>%
unnest()
unnestednrow(unnested)## [1] 3123264
parsnip and recipes together to conduct a stratified analysisnew_recipe = recipe(price ~ number_of_reviews +
review_scores_rating +
review_scores_accuracy +
review_scores_cleanliness +
review_scores_checkin +
review_scores_communication +
review_scores_location +
review_scores_value+
neighbourhood_group_cleansed,
data = airbnb_data)%>%
add_role(neighbourhood_group_cleansed, new_role = "strata")%>%
step_mutate(price = as.character(price)%>%
stringr::str_remove("\\$")%>%
as.numeric())%>%
prep()get_linear_reg = function( recipe, data, strata, ...){
prepped_data = recipe%>%
step_filter(neighbourhood_group_cleansed == strata, role = "strata")%>%
prep()
linear_regression = linear_reg()%>%
set_engine("lm")%>%
fit_xy(x = bake(prepped_data, data, all_predictors(), - has_role("strata")),
y = bake(prepped_data, data, all_outcomes()))
}get_linear_reg(new_recipe, airbnb_data, "Manhattan")%>%
tidy(conf.int = T)recipes and parsnip for a nested stratified analysisget_linear_reg = function(data, recipe, ...){
linear_regression = linear_reg()%>%
set_engine("lm")%>%
fit_xy(x = bake(recipe, data, all_predictors(), - has_role("strata")),
y = bake(recipe, data, all_outcomes()))
}Nest the data
strat_analysis = airbnb_data%>%
group_by(neighbourhood_group_cleansed)%>%
nest()
strat_analysisstrat_analysis = airbnb_data%>%
group_by(neighbourhood_group_cleansed)%>%
nest()%>%
mutate(stratified_models = map(data, get_linear_reg, new_recipe, airbnb_data))
strat_analysisstrat_analysis$stratified_models[[1]]%>%
tidy(conf.int = T)set.seed(2342)
airbnb_bootstraps = bootstraps(cleaned_airbnb_data, times = 10, strata = NULL)
airbnb_bootstrapsairbnb_bootstraps$splits## [[1]]
## <213/82/213>
##
## [[2]]
## <213/76/213>
##
## [[3]]
## <213/84/213>
##
## [[4]]
## <213/73/213>
##
## [[5]]
## <213/79/213>
##
## [[6]]
## <213/80/213>
##
## [[7]]
## <213/83/213>
##
## [[8]]
## <213/79/213>
##
## [[9]]
## <213/76/213>
##
## [[10]]
## <213/82/213>
rsplit objectclass(airbnb_bootstraps$splits[[1]])## [1] "rsplit" "boot_split"
analysis() gives the bootstraped set as a dataframeanalysis(airbnb_bootstraps$splits[[1]])assessment() gives the out-of-bag set as a dataframeassessment(airbnb_bootstraps$splits[[1]])set.seed(346234)
strat_airbnb_boots = bootstraps(cleaned_airbnb_data, times = 100, strata = "neighbourhood_group_cleansed")
strat_airbnb_bootsset.seed(86869)
airbnb_folds = vfold_cv(cleaned_airbnb_data, times = 10)
airbnb_foldsrolling_origin()mc_cv()nested_cv()rsample2caret()caret2rsample()as.data.frame.rsplit()fit_model() as a wrapper for lm()fit_model = function(data, formula){
lm(formula, data)%>%
tidy()
}
fit_model(cleaned_airbnb_data,
formula(price ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1))fit_model() onto bootstrap setsboot_cis = strat_airbnb_boots %>%
mutate(fitted_models = map(splits, fit_model,
formula = formula(price ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1)))
boot_cisboot_cis = boot_cis %>%
select(-c(splits, id))%>%
unnest()
boot_cisboot_cis_private_room = boot_cis%>%
filter(term == "room_typePrivate room")
quantile(boot_cis_private_room$estimate, probs = c(0.025, 0.975))## 2.5% 97.5%
## -136.41397 -88.85459
lm(price ~ host_is_superhost+
host_response_rate+
room_type+
host_listings_count+
number_of_reviews+
PC1,
data = cleaned_airbnb_data)%>%
tidy(conf.int = T)%>%
select(term, conf.low, conf.high)