output |
---|
github_document |
00249_example_8.1_of_section_8.2.1.R
# example 8.1 of section 8.2.1
# (example 8.1 of section 8.2.1) : Advanced data preparation : KDD and KDD Cup 2009 : Getting started with KDD Cup 2009 data
# Title: Preparing the KDD data for analysis
d <- read.table('../KDD2009/orange_small_train.data.gz', # Note: 1
header = TRUE,
sep = '\t',
na.strings = c('NA', '')) # Note: 2
churn <- read.table('../KDD2009/orange_small_train_churn.labels.txt',
header = FALSE, sep = '\t') # Note: 3
d$churn <- churn$V1 # Note: 4
set.seed(729375) # Note: 5
rgroup <- base::sample(c('train', 'calibrate', 'test'), # Note: 6
nrow(d),
prob = c(0.8, 0.1, 0.1),
replace = TRUE)
dTrain <- d[rgroup == 'train', , drop = FALSE]
dCal <- d[rgroup == 'calibrate', , drop = FALSE]
dTrainAll <- d[rgroup %in% c('train', 'calibrate'), , drop = FALSE]
dTest <- d[rgroup == 'test', , drop = FALSE]
outcome <- 'churn'
vars <- setdiff(colnames(dTrainAll), outcome)
rm(list=c('d', 'churn', 'rgroup')) # Note: 7
# Note 1:
# Read the file of independent variables. All
# data from
# https://github.com/WinVector/PDSwR2/tree/master/KDD2009.
# Note 2:
# Treat both NA and the empty string as missing
# data.
# Note 3:
# Read known churn outcomes.
# Note 4:
# Add churn as a new column.
# Note 5:
# By setting the seed to the pseudo-random
# number generator, we make our work reproducible:
# someone redoing it will see the exact same
# results.
# Note 6:
# Split data into train, calibration, and test sets.
# Explicitly specify the base::sample() function to avoid
# name collision with dplyr::sample(), if the dplyr package is loaded.
# Note 7:
# Remove unneeded objects from workspace.
00250_informalexample_8.1_of_section_8.2.1.R
# informalexample 8.1 of section 8.2.1
# (informalexample 8.1 of section 8.2.1) : Advanced data preparation : KDD and KDD Cup 2009 : Getting started with KDD Cup 2009 data
outcome_summary <- table(
churn = dTrain[, outcome], # Note: 1
useNA = 'ifany') # Note: 2
knitr::kable(outcome_summary)
churn | Freq |
---|---|
-1 | 37110 |
1 | 2943 |
# Note 1:
# Tabulate levels of churn outcome.
# Note 2:
# Include NA values in tabulation.
00251_informalexample_8.2_of_section_8.2.1.R
# informalexample 8.2 of section 8.2.1
# (informalexample 8.2 of section 8.2.1) : Advanced data preparation : KDD and KDD Cup 2009 : Getting started with KDD Cup 2009 data
outcome_summary["1"] / sum(outcome_summary) # Note: 1
## 1
## 0.07347764
# 1
# 0.07347764
# Note 1:
# Estimate observed churn rate or prevalence.
00253_example_8.3_of_section_8.2.2.R
# example 8.3 of section 8.2.2
# (example 8.3 of section 8.2.2) : Advanced data preparation : KDD and KDD Cup 2009 : The bull in the china shop approach
# Title: Trying just one variable
model2 <- glm((churn == 1) ~ Var1, data = dTrainAll, family = binomial)
summary(model2)
##
## Call:
## glm(formula = (churn == 1) ~ Var1, family = binomial, data = dTrainAll)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.3685 -0.3685 -0.3685 -0.3661 2.3762
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.6556108 0.1684722 -15.763 <2e-16 ***
## Var1 -0.0008306 0.0045203 -0.184 0.854
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 303.31 on 629 degrees of freedom
## Residual deviance: 303.27 on 628 degrees of freedom
## (44395 observations deleted due to missingness)
## AIC: 307.27
##
## Number of Fisher Scoring iterations: 5
#
# Call:
# glm(formula = (churn == 1) ~ Var1, family = binomial, data = dTrainAll)
#
# Deviance Residuals:
# Min 1Q Median 3Q Max
# -0.3997 -0.3694 -0.3691 -0.3691 2.3326
#
# Coefficients:
# Estimate Std. Error z value Pr(>|z|)
# (Intercept) -2.6523837 0.1674387 -15.841 <2e-16 ***
# Var1 0.0002429 0.0035759 0.068 0.946
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# (Dispersion parameter for binomial family taken to be 1)
#
# Null deviance: 302.09 on 620 degrees of freedom
# Residual deviance: 302.08 on 619 degrees of freedom
# (44407 observations deleted due to missingness) # Note: 1
# AIC: 306.08
#
# Number of Fisher Scoring iterations: 5
dim(dTrainAll)
## [1] 45025 231
# [1] 45028 234
# Note 1:
# This means the modeling procedure threw out this much (almost all) of our training data.
00254_informalexample_8.3_of_section_8.2.2.R
# informalexample 8.3 of section 8.2.2
# (informalexample 8.3 of section 8.2.2) : Advanced data preparation : KDD and KDD Cup 2009 : The bull in the china shop approach
head(dTrainAll$Var200)
## [1] <NA> <NA> vynJTq9 <NA> 0v21jmy <NA>
## 15415 Levels: _84etK_ _9bTOWp _A3VKFm _bq4Nkb _ct4nkXBMp ... zzQ9udm
# [1] <NA> <NA> vynJTq9 <NA> 0v21jmy <NA>
# 15415 Levels: _84etK_ _9bTOWp _A3VKFm _bq4Nkb _ct4nkXBMp ... zzQ9udm
length(unique(dTrainAll$Var200))
## [1] 14418
# [1] 14391
00255_example_8.4_of_section_8.3.R
# example 8.4 of section 8.3
# (example 8.4 of section 8.3) : Advanced data preparation : Basic data preparation for classification
# Title: Basic data preparation for classification
library("vtreat") # Note: 1
(parallel_cluster <- parallel::makeCluster(parallel::detectCores())) # Note: 2
## socket cluster with 4 nodes on host 'localhost'
treatment_plan <- vtreat::designTreatmentsC( # Note: 3
dTrain,
varlist = vars,
outcomename = "churn",
outcometarget = 1,
verbose = FALSE,
parallelCluster = parallel_cluster)
# Note 1:
# Attach the vtreat package for functions such as designTreatmentsC().
# Note 2:
# Start up a parallel cluster to speed up
# calculation. If you don’t want a parallel cluster, just set
# parallel_cluster to NULL.
# Note 3:
# Use designTreatmentsC() to learn the treatment plan from the training data. For a dataset of the size and complexity of KDD2009, this can take a few minutes.
00256_example_8.5_of_section_8.3.R
# example 8.5 of section 8.3
# (example 8.5 of section 8.3) : Advanced data preparation : Basic data preparation for classification
# Title: Preparing data with vtreat
dTrain_treated <- prepare(treatment_plan,
dTrain,
parallelCluster = parallel_cluster)
head(colnames(dTrain))
## [1] "Var1" "Var2" "Var3" "Var4" "Var5" "Var6"
## [1] "Var1" "Var2" "Var3" "Var4" "Var5" "Var6"
head(colnames(dTrain_treated)) # Note: 1
## [1] "Var1" "Var1_isBAD" "Var2" "Var2_isBAD" "Var3"
## [6] "Var3_isBAD"
## [1] "Var1" "Var1_isBAD" "Var2" "Var2_isBAD" "Var3"
## [6] "Var3_isBAD"
# Note 1:
# Compare the columns of the original dTrain data to its treated counterpart.
00257_informalexample_8.4_of_section_8.3.1.R
# informalexample 8.4 of section 8.3.1
# (informalexample 8.4 of section 8.3.1) : Advanced data preparation : Basic data preparation for classification : The variable score frame
score_frame <- treatment_plan$scoreFrame
t(subset(score_frame, origName %in% c("Var126", "Var189")))
## 225 226 341
## varName "Var126" "Var126_isBAD" "Var189"
## varMoves "TRUE" "TRUE" "TRUE"
## rsq "0.0030859179" "0.0136377093" "0.0118934515"
## sig "7.876602e-16" "2.453679e-64" "2.427376e-56"
## needsSplit "FALSE" "FALSE" "FALSE"
## extraModelDegrees "0" "0" "0"
## origName "Var126" "Var126" "Var189"
## code "clean" "isBAD" "clean"
## 342
## varName "Var189_isBAD"
## varMoves "TRUE"
## rsq "0.0001004614"
## sig "1.460688e-01"
## needsSplit "FALSE"
## extraModelDegrees "0"
## origName "Var189"
## code "isBAD"
# 225 226 341 342
# varName "Var126" "Var126_isBAD" "Var189" "Var189_isBAD" # Note: 1
# varMoves "TRUE" "TRUE" "TRUE" "TRUE" # Note: 2
# rsq "0.0030859179" "0.0136377093" "0.0118934515" "0.0001004614" # Note: 3
# sig "7.876602e-16" "2.453679e-64" "2.427376e-56" "1.460688e-01" # Note: 4
# needsSplit "FALSE" "FALSE" "FALSE" "FALSE" # Note: 5
# extraModelDegrees "0" "0" "0" "0" # Note: 6
# origName "Var126" "Var126" "Var189" "Var189" # Note: 7
# code "clean" "isBAD" "clean" "isBAD" # Note: 8
# Note 1:
# The name of the derived variable or column.
# Note 2:
# An indicator that this variable is not always the same value (not a constant, which would be useless for modeling).
# Note 3:
# The R-squared or pseudo R-squared of the
# variable, what fraction of the outcome variation
# this variable can explain on its own in a linear
# model.
# Note 4:
# The significance of the estimated
# R-squared.
# Note 5:
# An indicator that when TRUE is a warning to the user that the variable is
# hiding extra degrees of freedom (a measure of model complexity) and needs to be evaluated using cross-validation techniques..
# Note 6:
# How complex is the variable, for a categorical variable this is related to the number of levels.
# Note 7:
# Name of the original column the variable was derived from.
# Note 8:
# Name of the type of transformation used to build this variable.
00258_informalexample_8.5_of_section_8.3.1.R
# informalexample 8.5 of section 8.3.1
# (informalexample 8.5 of section 8.3.1) : Advanced data preparation : Basic data preparation for classification : The variable score frame
t(subset(score_frame, origName == "Var218"))
## 389 390 488
## varName "Var218_catP" "Var218_catB" "Var218_lev_x_cJvF"
## varMoves "TRUE" "TRUE" "TRUE"
## rsq "0.010955895" "0.012709736" "0.005295590"
## sig "4.836496e-52" "4.394981e-60" "4.902238e-26"
## needsSplit "TRUE" "TRUE" "FALSE"
## extraModelDegrees "2" "2" "0"
## origName "Var218" "Var218" "Var218"
## code "catP" "catB" "lev"
## 489
## varName "Var218_lev_x_UYBR"
## varMoves "TRUE"
## rsq "0.001970131"
## sig "1.218959e-10"
## needsSplit "FALSE"
## extraModelDegrees "0"
## origName "Var218"
## code "lev"
# 389 390 488 489
# varName "Var218_catP" "Var218_catB" "Var218_lev_x_cJvF" "Var218_lev_x_UYBR"
# varMoves "TRUE" "TRUE" "TRUE" "TRUE"
# rsq "0.011014574" "0.012245152" "0.005295590" "0.001970131"
# sig "2.602574e-52" "5.924945e-58" "4.902238e-26" "1.218959e-10"
# needsSplit " TRUE" " TRUE" "FALSE" "FALSE"
# extraModelDegrees "2" "2" "0" "0"
# origName "Var218" "Var218" "Var218" "Var218"
# code "catP" "catB" "lev" "lev"
00259_informalexample_8.6_of_section_8.3.1.R
# informalexample 8.6 of section 8.3.1
# (informalexample 8.6 of section 8.3.1) : Advanced data preparation : Basic data preparation for classification : The variable score frame
comparison <- data.frame(original218 = dTrain$Var218,
impact218 = dTrain_treated$Var218_catB)
head(comparison)
## original218 impact218
## 1 cJvF -0.2180735
## 2 <NA> 1.5155125
## 3 UYBR 0.1221393
## 4 UYBR 0.1221393
## 5 UYBR 0.1221393
## 6 UYBR 0.1221393
## original218 impact218
## 1 cJvF -0.2180735
## 2 <NA> 1.5155125
## 3 UYBR 0.1221393
## 4 UYBR 0.1221393
## 5 UYBR 0.1221393
## 6 UYBR 0.1221393
00260_informalexample_8.7_of_section_8.3.1.R
# informalexample 8.7 of section 8.3.1
# (informalexample 8.7 of section 8.3.1) : Advanced data preparation : Basic data preparation for classification : The variable score frame
treatment_plan_2 <- design_missingness_treatment(dTrain, varlist = vars) # Note: 1
dtrain_2 <- prepare(treatment_plan_2, dTrain) # Note: 2
head(dtrain_2$Var218)
## [1] "cJvF" "_invalid_" "UYBR" "UYBR" "UYBR" "UYBR"
## [1] "cJvF" "_invalid_" "UYBR" "UYBR" "UYBR" "UYBR"
model <- glm(churn ==1 ~ Var218, # Note: 3
data = dtrain_2,
family = "binomial")
pred <- predict(model, # Note: 4
newdata = dtrain_2,
type = "response")
(prevalence <- mean(dTrain$churn == 1) ) # Note: 5
## [1] 0.07347764
## [1] 0.07347764
logit <- function(p) { # Note: 6
log (p / (1-p) )
}
comparison$glm218 <- logit(pred) - logit(prevalence) # Note: 7
head(comparison)
## original218 impact218 glm218
## 1 cJvF -0.2180735 -0.2180735
## 2 <NA> 1.5155125 1.5155121
## 3 UYBR 0.1221393 0.1221392
## 4 UYBR 0.1221393 0.1221392
## 5 UYBR 0.1221393 0.1221392
## 6 UYBR 0.1221393 0.1221392
## original218 impact218 glm218
## 1 cJvF -0.2180735 -0.2180735 # Note: 8
## 2 <NA> 1.5155125 1.5155121
## 3 UYBR 0.1221393 0.1221392
## 4 UYBR 0.1221393 0.1221392
## 5 UYBR 0.1221393 0.1221392
## 6 UYBR 0.1221393 0.1221392
# Note 1:
# Simple treatment to turn NA into a safe string.
# Note 2:
# Create the treated data.
# Note 3:
# Fit the one-variable logistic regression model.
# Note 4:
# Make predictions on the data.
# Note 5:
# Calculate the global probability of churn.
# Note 6:
# A function to calculate the logit, or log-odds of a probability.
# Note 7:
# Calculate the catB values by hand.
# Note 8:
# Notice the impact codes
# from vtreat match the “delta logit” encoded
# predictions from the standard glm model. This helps
# illustrate how vtreat is implemented.
00261_informalexample_8.8_of_section_8.3.1.R
# informalexample 8.8 of section 8.3.1
# (informalexample 8.8 of section 8.3.1) : Advanced data preparation : Basic data preparation for classification : The variable score frame
score_frame[score_frame$origName == "Var200", , drop = FALSE]
## varName varMoves rsq sig needsSplit
## 361 Var200_catP TRUE 0.005729292 4.930851e-28 TRUE
## 362 Var200_catB TRUE 0.000945847 8.192687e-06 TRUE
## 428 Var200_lev_NA TRUE 0.005729838 4.902365e-28 FALSE
## extraModelDegrees origName code
## 361 13323 Var200 catP
## 362 13323 Var200 catB
## 428 0 Var200 lev
# varName varMoves rsq sig needsSplit extraModelDegrees origName code
# 361 Var200_catP TRUE 0.005729835 4.902546e-28 TRUE 13323 Var200 catP
# 362 Var200_catB TRUE 0.001476298 2.516703e-08 TRUE 13323 Var200 catB
# 428 Var200_lev_NA TRUE 0.005729838 4.902365e-28 FALSE 0 Var200 lev
00262_informalexample_8.9_of_section_8.3.2.R
# informalexample 8.9 of section 8.3.2
# (informalexample 8.9 of section 8.3.2) : Advanced data preparation : Basic data preparation for classification : Properly using the treatment plan
dCal_treated <- prepare(treatment_plan,
dCal,
parallelCluster = parallel_cluster)
00263_informalexample_8.10_of_section_8.3.2.R
# informalexample 8.10 of section 8.3.2
# (informalexample 8.10 of section 8.3.2) : Advanced data preparation : Basic data preparation for classification : Properly using the treatment plan
library("sigr")
calcAUC(dTrain_treated$Var200_catB, dTrain_treated$churn)
## [1] 0.8279249
# [1] 0.8279249
calcAUC(dCal_treated$Var200_catB, dCal_treated$churn)
## [1] 0.5505401
# [1] 0.5505401
00264_example_8.6_of_section_8.4.1.R
# example 8.6 of section 8.4.1
# (example 8.6 of section 8.4.1) : Advanced data preparation : Advanced data preparation for classification : Using mkCrossFrameCExperiment()
# Title: Advanced data preparation for classification
library("vtreat")
parallel_cluster <- parallel::makeCluster(parallel::detectCores())
cross_frame_experiment <- vtreat::mkCrossFrameCExperiment(
dTrainAll,
varlist = vars,
outcomename = "churn",
outcometarget = 1,
verbose = FALSE,
parallelCluster = parallel_cluster)
dTrainAll_treated <- cross_frame_experiment$crossFrame # Note: 1
treatment_plan <- cross_frame_experiment$treatments
score_frame <- treatment_plan$scoreFrame
dTest_treated <- prepare(treatment_plan, # Note: 2
dTest,
parallelCluster = parallel_cluster)
# Note 1:
# We will use the cross-frame to train the logistic regression model.
# Note 2:
# Prepare the test set so we can call the model on it.
00265_informalexample_8.11_of_section_8.4.1.R
# informalexample 8.11 of section 8.4.1
# (informalexample 8.11 of section 8.4.1) : Advanced data preparation : Advanced data preparation for classification : Using mkCrossFrameCExperiment()
library("sigr")
calcAUC(dTrainAll_treated$Var200_catB, dTrainAll_treated$churn)
## [1] 0.5496616
# [1] 0.5450466
calcAUC(dTest_treated$Var200_catB, dTest_treated$churn)
## [1] 0.5290295
# [1] 0.5290295
00266_informalexample_8.12_of_section_8.4.2.R
# informalexample 8.12 of section 8.4.2
# (informalexample 8.12 of section 8.4.2) : Advanced data preparation : Advanced data preparation for classification : Building a model
k <- 1 # Note: 1
(significance_cutoff <- k / nrow(score_frame))
## [1] 0.001831502
# [1] 0.001831502
score_frame$selected <- score_frame$sig < significance_cutoff
suppressPackageStartupMessages(library("dplyr")) # Note: 2
score_frame %>%
group_by(., code, selected) %>%
summarize(.,
count = n()) %>%
ungroup(.) %>%
cdata::pivot_to_rowrecs(.,
columnToTakeKeysFrom = 'selected',
columnToTakeValuesFrom = 'count',
rowKeyColumns = 'code',
sep = '=')
## # A tibble: 5 x 3
## code `selected=FALSE` `selected=TRUE`
## <chr> <int> <int>
## 1 catB 11 22
## 2 catP 7 26
## 3 clean 158 15
## 4 isBAD 60 111
## 5 lev 74 62
# # A tibble: 5 x 3
# code `selected=FALSE` `selected=TRUE`
# <chr> <int> <int>
# 1 catB 12 21
# 2 catP 7 26
# 3 clean 158 15
# 4 isBAD 60 111
# 5 lev 74 62
# Note 1:
# Use our filter significances at k / nrow(score_frame) heuristic with k = 1
# Note 2:
# Bring in the dplyr package to help summarize the selections.
00267_example_8.7_of_section_8.4.2.R
# example 8.7 of section 8.4.2
# (example 8.7 of section 8.4.2) : Advanced data preparation : Advanced data preparation for classification : Building a model
# Title: Basic variable re-coding and selection
library("wrapr")
##
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
##
## coalesce
newvars <- score_frame$varName[score_frame$selected] # Note: 1
f <- mk_formula("churn", newvars, outcome_target = 1) # Note: 2
model <- glm(f, data = dTrainAll_treated, family = binomial) # Note: 3
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Warning message:
# glm.fit: fitted probabilities numerically 0 or 1 occurred
# Note 1:
# Build a formula specifying modeling churn == 1 as a function of all variables.
# Note 2:
# Use the modeling formula with R’s glm() function.
# Note 3:
# Take heed of this warning: it is hinting we should move on to a regularized method such as
# glmnet.
00268_informalexample_8.13_of_section_8.4.2.R
# informalexample 8.13 of section 8.4.2
# (informalexample 8.13 of section 8.4.2) : Advanced data preparation : Advanced data preparation for classification : Building a model
library("sigr")
dTest_treated$glm_pred <- predict(model, # Note: 1
newdata = dTest_treated,
type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
# Warning message: # Note: 2
# In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type == :
# prediction from a rank-deficient fit may be misleading
calcAUC(dTest_treated$glm_pred, dTest_treated$churn == 1) # Note: 3
## [1] 0.7231226
## [1] 0.7232192
permTestAUC(dTest_treated, "glm_pred", "churn", yTarget = 1) # Note: 4
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.7231, s.d.=0.01626, p<1e-05)."
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.7232, s.d.=0.01535, p<1e-05)."
var_aucs <- vapply(newvars, # Note: 5
function(vi) {
calcAUC(dTrainAll_treated[[vi]], dTrainAll_treated$churn == 1)
}, numeric(1))
(best_train_aucs <- var_aucs[var_aucs >= max(var_aucs)])
## Var216_catB
## 0.5892631
## Var216_catB
## 0.5873512
# Note 1:
# Add the model prediction to the evaluation data as a new column.
# Note 2:
# Again, take heed of this warning: it is hinting we should
# move on to a regularized method such as
# glmnet.
# Note 3:
# Calculate the AUC of the model on hold-out data.
# Note 4:
# Calculate the AUC a second time, using an alternative method that also estimates a standard deviation or error-bar.
# Note 5:
# Here we calculate the best single variable model AUC for comparison.
00269_informalexample_8.14_of_section_8.4.2.R
# informalexample 8.14 of section 8.4.2
# (informalexample 8.14 of section 8.4.2) : Advanced data preparation : Advanced data preparation for classification : Building a model
table(prediction = dTest_treated$glm_pred >= 0.5,
truth = dTest$churn)
## truth
## prediction -1 1
## FALSE 4591 375
## TRUE 8 1
# truth
# prediction -1 1
# FALSE 4591 375
# TRUE 8 1
00270_informalexample_8.15_of_section_8.4.2.R
# informalexample 8.15 of section 8.4.2
# (informalexample 8.15 of section 8.4.2) : Advanced data preparation : Advanced data preparation for classification : Building a model
table(prediction = dTest_treated$glm_pred>0.15,
truth = dTest$churn)
## truth
## prediction -1 1
## FALSE 4247 266
## TRUE 352 110
# truth
# prediction -1 1
# FALSE 4243 266
# TRUE 356 110
00271_informalexample_8.16_of_section_8.4.2.R
# informalexample 8.16 of section 8.4.2
# (informalexample 8.16 of section 8.4.2) : Advanced data preparation : Advanced data preparation for classification : Building a model
WVPlots::DoubleDensityPlot(dTest_treated, "glm_pred", "churn",
"glm prediction on test, double density plot")
WVPlots::PRTPlot(dTest_treated, "glm_pred", "churn",
"glm prediction on test, enrichment plot",
truthTarget = 1,
plotvars = c("enrichment", "recall"),
thresholdrange = c(0, 1.0))
00272_informalexample_8.17_of_section_8.5.R
# informalexample 8.17 of section 8.5
# (informalexample 8.17 of section 8.5) : Advanced data preparation : Preparing data for regression modeling
auto_mpg <- readRDS('../auto_mpg/auto_mpg.RDS')
knitr::kable(head(auto_mpg)) # Note: 1
mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | car_name |
---|---|---|---|---|---|---|---|---|
18 | 8 | 307 | 130 | 3504 | 12.0 | 70 | 1 | "chevrolet chevelle malibu" |
15 | 8 | 350 | 165 | 3693 | 11.5 | 70 | 1 | "buick skylark 320" |
18 | 8 | 318 | 150 | 3436 | 11.0 | 70 | 1 | "plymouth satellite" |
16 | 8 | 304 | 150 | 3433 | 12.0 | 70 | 1 | "amc rebel sst" |
17 | 8 | 302 | 140 | 3449 | 10.5 | 70 | 1 | "ford torino" |
15 | 8 | 429 | 198 | 4341 | 10.0 | 70 | 1 | "ford galaxie 500" |
# Note 1:
# Take a quick look at the data.
00273_informalexample_8.18_of_section_8.5.R
# informalexample 8.18 of section 8.5
# (informalexample 8.18 of section 8.5) : Advanced data preparation : Preparing data for regression modeling
library("wrapr")
vars <- c("cylinders", "displacement", # Note: 1
"horsepower", "weight", "acceleration",
"model_year", "origin")
f <- mk_formula("mpg", vars)
model <- lm(f, data = auto_mpg)
auto_mpg$prediction <- predict(model, newdata = auto_mpg) # Note: 2
str(auto_mpg[!complete.cases(auto_mpg), , drop = FALSE])
## 'data.frame': 6 obs. of 10 variables:
## $ mpg : num 25 21 40.9 23.6 34.5 23
## $ cylinders : num 4 6 4 4 4 4
## $ displacement: num 98 200 85 140 100 151
## $ horsepower : num NA NA NA NA NA NA
## $ weight : num 2046 2875 1835 2905 2320 ...
## $ acceleration: num 19 17 17.3 14.3 15.8 20.5
## $ model_year : num 71 74 80 80 81 82
## $ origin : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
## $ car_name : chr "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" "\"ford mustang cobra\"" ...
## $ prediction : num NA NA NA NA NA NA
# 'data.frame': 6 obs. of 10 variables:
# $ mpg : num 25 21 40.9 23.6 34.5 23
# $ cylinders : num 4 6 4 4 4 4
# $ displacement: num 98 200 85 140 100 151
# $ horsepower : num NA NA NA NA NA NA # Note: 3
# $ weight : num 2046 2875 1835 2905 2320 ...
# $ acceleration: num 19 17 17.3 14.3 15.8 20.5
# $ model_year : num 71 74 80 80 81 82
# $ origin : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
# $ car_name : chr "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" ...
# $ prediction : num NA NA NA NA NA NA # Note: 4
# Note 1:
# Jump into modeling without bothering to treat the data.
# Note 2:
# Add the model predictions as a new column.
# Note 3:
# Notice these cars do not have a recorded horsepower.
# Note 4:
# So these cars do not get a prediction.
00274_informalexample_8.19_of_section_8.5.R
# informalexample 8.19 of section 8.5
# (informalexample 8.19 of section 8.5) : Advanced data preparation : Preparing data for regression modeling
library("vtreat")
cfe <- mkCrossFrameNExperiment(auto_mpg, vars, "mpg", # Note: 1
verbose = FALSE)
treatment_plan <- cfe$treatments
auto_mpg_treated <- cfe$crossFrame
score_frame <- treatment_plan$scoreFrame
new_vars <- score_frame$varName
newf <- mk_formula("mpg", new_vars)
new_model <- lm(newf, data = auto_mpg_treated)
auto_mpg$prediction <- predict(new_model, newdata = auto_mpg_treated)
## Warning in predict.lm(new_model, newdata = auto_mpg_treated): prediction
## from a rank-deficient fit may be misleading
# Warning in predict.lm(new_model, newdata = auto_mpg_treated): prediction
# from a rank-deficient fit may be misleading
str(auto_mpg[!complete.cases(auto_mpg), , drop = FALSE])
## 'data.frame': 6 obs. of 10 variables:
## $ mpg : num 25 21 40.9 23.6 34.5 23
## $ cylinders : num 4 6 4 4 4 4
## $ displacement: num 98 200 85 140 100 151
## $ horsepower : num NA NA NA NA NA NA
## $ weight : num 2046 2875 1835 2905 2320 ...
## $ acceleration: num 19 17 17.3 14.3 15.8 20.5
## $ model_year : num 71 74 80 80 81 82
## $ origin : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
## $ car_name : chr "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" "\"ford mustang cobra\"" ...
## $ prediction : num 24.5 22.5 35 26 32.6 ...
# 'data.frame': 6 obs. of 10 variables:
# $ mpg : num 25 21 40.9 23.6 34.5 23
# $ cylinders : num 4 6 4 4 4 4
# $ displacement: num 98 200 85 140 100 151
# $ horsepower : num NA NA NA NA NA NA
# $ weight : num 2046 2875 1835 2905 2320 ...
# $ acceleration: num 19 17 17.3 14.3 15.8 20.5
# $ model_year : num 71 74 80 80 81 82
# $ origin : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
# $ car_name : chr "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" ...
# $ prediction : num 24.6 22.4 34.2 26.1 33.3 ... # Note: 2
# Note 1:
# Try it again with vtreat data preparation.
# Note 2:
# Now we can make predictions, even for items that have missing data.
00275_informalexample_8.20_of_section_8.6.2.R
# informalexample 8.20 of section 8.6.2
# (informalexample 8.20 of section 8.6.2) : Advanced data preparation : Mastering the vtreat package : Missing values
library("wrapr") # Note: 1
d <- build_frame(
"x1" , "x2" , "x3", "y" |
1 , "a" , 6 , 10 |
NA_real_, "b" , 7 , 20 |
3 , NA_character_, 8 , 30 )
knitr::kable(d)
x1 | x2 | x3 | y |
---|---|---|---|
1 | a | 6 | 10 |
NA | b | 7 | 20 |
3 | NA | 8 | 30 |
# Note 1:
# Bring in the wrapr package for build_frame and the
# wrapr “dot pipe”.
00276_informalexample_8.21_of_section_8.6.2.R
# informalexample 8.21 of section 8.6.2
# (informalexample 8.21 of section 8.6.2) : Advanced data preparation : Mastering the vtreat package : Missing values
plan1 <- vtreat::design_missingness_treatment(d)
vtreat::prepare(plan1, d) %.>% # Note: 1
knitr::kable(.)
x1 | x1_isBAD | x2 | x3 | y |
---|---|---|---|---|
1 | 0 | a | 6 | 10 |
2 | 1 | b | 7 | 20 |
3 | 0 | invalid | 8 | 30 |
# Note 1:
# Here we are using wrapr’s dot pipe instead of
# magrittr’s forward pipe. The dot pipe requires the
# “explicit dot argument” notation discussed in
# chapter 5.
00277_informalexample_8.22_of_section_8.6.3.R
# informalexample 8.22 of section 8.6.3
# (informalexample 8.22 of section 8.6.3) : Advanced data preparation : Mastering the vtreat package : Indicator variables
d <- build_frame(
"x1" , "x2" , "x3", "y" |
1 , "a" , 6 , 10 |
NA_real_, "b" , 7 , 20 |
3 , NA_character_, 8 , 30 )
print(d)
## x1 x2 x3 y
## 1 1 a 6 10
## 2 NA b 7 20
## 3 3 <NA> 8 30
# x1 x2 x3 y
# 1 1 a 6 10
# 2 NA b 7 20 # Note: 1
# 3 3 <NA> 8 30
plan2 <- vtreat::designTreatmentsZ(d,
varlist = c("x1", "x2", "x3"),
verbose = FALSE)
vtreat::prepare(plan2, d)
## x1 x1_isBAD x3 x2_lev_NA x2_lev_x_a x2_lev_x_b
## 1 1 0 6 0 1 0
## 2 2 1 7 0 0 1
## 3 3 0 8 1 0 0
# x1 x1_isBAD x3 x2_lev_NA x2_lev_x_a x2_lev_x_b
# 1 1 0 6 0 1 0
# 2 2 1 7 0 0 1 # Note: 2
# 3 3 0 8 1 0 0
# Note 1:
# The second value of x2 is “b”
# Note 2:
# In the second row of the treated data, x2_lev_x_b = 1
00278_informalexample_8.23_of_section_8.6.4.R
# informalexample 8.23 of section 8.6.4
# (informalexample 8.23 of section 8.6.4) : Advanced data preparation : Mastering the vtreat package : Impact coding
d <- build_frame(
"x1" , "x2" , "x3", "y" |
1 , "a" , 6 , 10 |
NA_real_, "b" , 7 , 20 |
3 , NA_character_, 8 , 30 )
print(d)
## x1 x2 x3 y
## 1 1 a 6 10
## 2 NA b 7 20
## 3 3 <NA> 8 30
# x1 x2 x3 y
# 1 1 a 6 10
# 2 NA b 7 20
# 3 3 <NA> 8 30
plan3 <- vtreat::designTreatmentsN(d,
varlist = c("x1", "x2", "x3"),
outcomename = "y",
codeRestriction = "catN",
verbose = FALSE)
vtreat::prepare(plan3, d)
## x2_catN y
## 1 -10 10
## 2 0 20
## 3 10 30
# x2_catN y
# 1 -10 10
# 2 0 20
# 3 10 30
00279_informalexample_8.24_of_section_8.6.4.R
# informalexample 8.24 of section 8.6.4
# (informalexample 8.24 of section 8.6.4) : Advanced data preparation : Mastering the vtreat package : Impact coding
plan4 <- vtreat::designTreatmentsC(d,
varlist = c("x1", "x2", "x3"),
outcomename = "y",
outcometarget = 20,
codeRestriction = "catB",
verbose = FALSE)
vtreat::prepare(plan4, d)
## x2_catB y
## 1 -8.517343 10
## 2 9.903538 20
## 3 -8.517343 30
# x2_catB y
# 1 -8.517343 10
# 2 9.903538 20
# 3 -8.517343 30
00280_informalexample_8.25_of_section_8.6.5.R
# informalexample 8.25 of section 8.6.5
# (informalexample 8.25 of section 8.6.5) : Advanced data preparation : Mastering the vtreat package : The treatment plan
class(plan4)
## [1] "treatmentplan"
# [1] "treatmentplan"
names(plan4)
## [1] "treatments" "scoreFrame" "outcomename" "vtreatVersion"
## [5] "outcomeType" "outcomeTarget" "meanY" "splitmethod"
# [1] "treatments" "scoreFrame" "outcomename" "vtreatVersion" "outcomeType"
# [6] "outcomeTarget" "meanY" "splitmethod"
00281_informalexample_8.26_of_section_8.6.5.R
# informalexample 8.26 of section 8.6.5
# (informalexample 8.26 of section 8.6.5) : Advanced data preparation : Mastering the vtreat package : The treatment plan
plan4$scoreFrame
## varName varMoves rsq sig needsSplit extraModelDegrees origName
## 1 x2_catB TRUE 1 0.0506719 TRUE 2 x2
## code
## 1 catB
# varName varMoves rsq sig needsSplit extraModelDegrees origName code
# 1 x2_catB TRUE 1 0.0506719 TRUE 2 x2 catB
00282_example_8.8_of_section_8.6.6.R
# example 8.8 of section 8.6.6
# (example 8.8 of section 8.6.6) : Advanced data preparation : Mastering the vtreat package : The cross-frame
# Title: An information-free dataset
set.seed(2019) # Note: 1
d <- data.frame( # Note: 2
x_bad = sample(letters, 100, replace = TRUE),
y = rnorm(100),
stringsAsFactors = FALSE
)
d$x_good <- ifelse(d$y > rnorm(100), "non-neg", "neg") # Note: 3
head(d) # Note: 4
## x_bad y x_good
## 1 y -0.7603575 non-neg
## 2 j 0.4442418 non-neg
## 3 e 1.7386856 non-neg
## 4 m -0.7752029 non-neg
## 5 q -1.1825636 neg
## 6 x -0.3140285 non-neg
# x_bad y x_good
# 1 u -0.05294738 non-neg
# 2 s -0.23639840 neg
# 3 h -0.33796351 non-neg
# 4 q -0.75548467 non-neg
# 5 b -0.86159347 neg
# 6 b -0.52766549 non-neg
# Note 1:
# Set pseudo-random number generator seed to make example reproducible.
# Note 2:
# Build example data where there is no relation between x_bad and y.
# Note 3:
# x_good is a noisy prediction of the sign of y, so it does have some information about y.
# Note 4:
# Take a look at our synthetic example data. The idea is: y is related to x_good in a noisy fashion, but unrelated to x_bad.
# In this case, we know what variables should be chosen, so we can tell if our acceptance procedure is working correctly.
00283_example_8.9_of_section_8.6.6.R
# example 8.9 of section 8.6.6
# (example 8.9 of section 8.6.6) : Advanced data preparation : Mastering the vtreat package : The cross-frame
# Title: The dangers of re-using data
plan5 <- vtreat::designTreatmentsN(d, # Note: 1
varlist = c("x_bad", "x_good"),
outcomename = "y",
codeRestriction = "catN",
minFraction = 2,
verbose = FALSE)
class(plan5)
## [1] "treatmentplan"
# [1] "treatmentplan"
print(plan5) # Note: 2
## origName varName code rsq sig extraModelDegrees
## 1 x_bad x_bad_catN catN 9.93962e-05 9.215776e-01 25
## 2 x_good x_good_catN catN 3.25805e-01 5.631959e-10 1
# origName varName code rsq sig extraModelDegrees
# 1 x_bad x_bad_catN catN 4.906903e-05 9.448548e-01 24
# 2 x_good x_good_catN catN 2.602702e-01 5.895285e-08 1
training_data1 <- vtreat::prepare(plan5, d) # Note: 3
res1 <- vtreat::patch_columns_into_frame(d, training_data1) # Note: 4
head(res1)
## x_bad x_good x_bad_catN x_good_catN y
## 1 y non-neg 0.2879701 0.523549 -0.7603575
## 2 j non-neg -0.1957772 0.523549 0.4442418
## 3 e non-neg 0.1720338 0.523549 1.7386856
## 4 m non-neg -0.3295562 0.523549 -0.7752029
## 5 q neg -0.5767783 -0.590385 -1.1825636
## 6 x non-neg -0.2259024 0.523549 -0.3140285
# x_bad x_good x_bad_catN x_good_catN y
# 1 u non-neg 0.4070979 0.4305195 -0.05294738
# 2 s neg -0.1133011 -0.5706886 -0.23639840
# 3 h non-neg -0.3202346 0.4305195 -0.33796351
# 4 q non-neg -0.5447443 0.4305195 -0.75548467
# 5 b neg -0.3890076 -0.5706886 -0.86159347
# 6 b non-neg -0.3890076 0.4305195 -0.52766549
sigr::wrapFTest(res1, "x_good_catN", "y") # Note: 5
## [1] "F Test summary: (R2=0.3241, F(1,98)=46.98, p<1e-05)."
# [1] "F Test summary: (R2=0.2717, F(1,98)=36.56, p<1e-05)."
sigr::wrapFTest(res1, "x_bad_catN", "y") # Note: 6
## [1] "F Test summary: (R2=0.1943, F(1,98)=23.64, p<1e-05)."
# [1] "F Test summary: (R2=0.2342, F(1,98)=29.97, p<1e-05)."
# Note 1:
# Design a variable treatment plan using x_bad and x_good to try predicting y.
# Note 2:
# Notice the derived variable x_good_catN comes out as having a significant signal, and x_bad_catN does not.
# This is due to the proper use of cross-validation in the vtreat quality estimates.
# Note 3:
# Call prepare() on the same data used to design the treatment
# plan—this is not always safe, as we shall
# see.
# Note 4:
# Combine the data frames d an training_data1, using training_data1 when there are columns with duplicate names.
# Note 5:
# Use a statistical F-test to check the predictive power of x_good_catN.
# Note 6:
# x_bad_catN’s F-test is inflated and falsely looks
# significant, this is due to failure to use cross validated methods.
00284_example_8.10_of_section_8.6.6.R
# example 8.10 of section 8.6.6
# (example 8.10 of section 8.6.6) : Advanced data preparation : Mastering the vtreat package : The cross-frame
# Title: Using mkCrossFrameNExperiment()
cfe <- vtreat::mkCrossFrameNExperiment(d,
varlist = c("x_bad", "x_good"),
outcomename = "y",
codeRestriction = "catN",
minFraction = 2,
verbose = FALSE)
plan6 <- cfe$treatments
training_data2 <- cfe$crossFrame
res2 <- vtreat::patch_columns_into_frame(d, training_data2)
head(res2)
## x_bad x_good x_bad_catN x_good_catN y
## 1 y non-neg 0.7808611 0.4535235 -0.7603575
## 2 j non-neg -1.4621185 0.4535235 0.4442418
## 3 e non-neg -0.4041591 0.5424189 1.7386856
## 4 m non-neg -0.2240972 0.5772362 -0.7752029
## 5 q neg -1.1043780 -0.7107558 -1.1825636
## 6 x non-neg 0.0000000 0.5424189 -0.3140285
# x_bad x_good x_bad_catN x_good_catN y
# 1 u non-neg 0.2834739 0.4193180 -0.05294738
# 2 s neg -0.1085887 -0.6212118 -0.23639840
# 3 h non-neg 0.0000000 0.5095586 -0.33796351
# 4 q non-neg -0.5142570 0.5095586 -0.75548467
# 5 b neg -0.3540889 -0.6212118 -0.86159347
# 6 b non-neg -0.3540889 0.4193180 -0.52766549
sigr::wrapFTest(res2, "x_bad_catN", "y")
## [1] "F Test summary: (R2=-0.6653, F(1,98)=-39.15, p=n.s.)."
# [1] "F Test summary: (R2=-0.1389, F(1,98)=-11.95, p=n.s.)."
sigr::wrapFTest(res2, "x_good_catN", "y")
## [1] "F Test summary: (R2=0.3039, F(1,98)=42.78, p<1e-05)."
# [1] "F Test summary: (R2=0.2532, F(1,98)=33.22, p<1e-05)."
plan6$scoreFrame # Note: 1
## varName varMoves rsq sig needsSplit extraModelDegrees
## 1 x_bad_catN TRUE 0.0806097 4.201189e-03 TRUE 25
## 2 x_good_catN TRUE 0.3117822 1.576683e-09 TRUE 1
## origName code
## 1 x_bad catN
## 2 x_good catN
# varName varMoves rsq sig needsSplit
# 1 x_bad_catN TRUE 0.01436145 2.349865e-01 TRUE
# 2 x_good_catN TRUE 0.26478467 4.332649e-08 TRUE
# extraModelDegrees origName code
# 1 24 x_bad catN
# 2 1 x_good catN
# Note 1:
# The F-tests on the data and the scoreFrame statistics now largely agree.