c08_Advanced_Data_Preparation.md
00249_example_8.1_of_section_8.2.1.R

# example 8.1 of section 8.2.1 
# (example 8.1 of section 8.2.1)  : Advanced data preparation : KDD and KDD Cup 2009 : Getting started with KDD Cup 2009 data 
# Title: Preparing the KDD data for analysis 

d <- read.table('../KDD2009/orange_small_train.data.gz',             	# Note: 1 
   header = TRUE,
   sep = '\t',
   na.strings = c('NA', ''))                                  	# Note: 2 
                                                
churn <- read.table('../KDD2009/orange_small_train_churn.labels.txt',
   header = FALSE, sep = '\t')                             	# Note: 3 
d$churn <- churn$V1                                     	# Note: 4 

set.seed(729375)                                         	# Note: 5 
rgroup <- base::sample(c('train', 'calibrate', 'test'), 	# Note: 6 
   nrow(d), 
   prob = c(0.8, 0.1, 0.1),
   replace = TRUE)
dTrain <- d[rgroup == 'train', , drop = FALSE]
dCal <- d[rgroup == 'calibrate', , drop = FALSE]
dTrainAll <- d[rgroup %in% c('train', 'calibrate'), , drop = FALSE]
dTest <- d[rgroup == 'test', , drop = FALSE]
                                                
outcome <- 'churn' 
vars <- setdiff(colnames(dTrainAll), outcome)

rm(list=c('d', 'churn', 'rgroup'))                         	# Note: 7

# Note 1: 
#   Read the file of independent variables. All 
#   data from 
#   https://github.com/WinVector/PDSwR2/tree/master/KDD2009. 

# Note 2: 
#   Treat both NA and the empty string as missing 
#   data. 

# Note 3: 
#   Read known churn outcomes. 

# Note 4: 
#   Add churn as a new column. 

# Note 5: 
#   By setting the seed to the pseudo-random 
#   number generator, we make our work reproducible: 
#   someone redoing it will see the exact same 
#   results. 

# Note 6: 
#   Split data into train, calibration, and test sets. 
#   Explicitly specify the base::sample() function to avoid 
#   accidentally calling a sampling function from another attached package (for example, dplyr's sampling helpers). 

# Note 7: 
#   Remove unneeded objects from workspace. 

00250_informalexample_8.1_of_section_8.2.1.R

# informalexample 8.1 of section 8.2.1 
# (informalexample 8.1 of section 8.2.1)  : Advanced data preparation : KDD and KDD Cup 2009 : Getting started with KDD Cup 2009 data 

outcome_summary <- table(
   churn = dTrain[, outcome],                	# Note: 1 
   useNA = 'ifany')                               	# Note: 2 

knitr::kable(outcome_summary)
| churn |  Freq |
|:------|------:|
| -1    | 37110 |
| 1     |  2943 |
# Note 1: 
#   Tabulate levels of churn outcome. 

# Note 2: 
#   Include NA values in tabulation. 

00251_informalexample_8.2_of_section_8.2.1.R

# informalexample 8.2 of section 8.2.1 
# (informalexample 8.2 of section 8.2.1)  : Advanced data preparation : KDD and KDD Cup 2009 : Getting started with KDD Cup 2009 data 

outcome_summary["1"] / sum(outcome_summary) 	# Note: 1 
##          1 
## 0.07347764
#          1 
# 0.07347764

# Note 1: 
#   Estimate observed churn rate or prevalence. 

00253_example_8.3_of_section_8.2.2.R

# example 8.3 of section 8.2.2 
# (example 8.3 of section 8.2.2)  : Advanced data preparation : KDD and KDD Cup 2009 : The bull in the china shop approach 
# Title: Trying just one variable 

model2 <- glm((churn == 1) ~ Var1, data = dTrainAll, family = binomial)
summary(model2)
## 
## Call:
## glm(formula = (churn == 1) ~ Var1, family = binomial, data = dTrainAll)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.3685  -0.3685  -0.3685  -0.3661   2.3762  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.6556108  0.1684722 -15.763   <2e-16 ***
## Var1        -0.0008306  0.0045203  -0.184    0.854    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 303.31  on 629  degrees of freedom
## Residual deviance: 303.27  on 628  degrees of freedom
##   (44395 observations deleted due to missingness)
## AIC: 307.27
## 
## Number of Fisher Scoring iterations: 5
# 
# Call:
# glm(formula = (churn == 1) ~ Var1, family = binomial, data = dTrainAll)
# 
# Deviance Residuals: 
#     Min       1Q   Median       3Q      Max  
# -0.3997  -0.3694  -0.3691  -0.3691   2.3326  
# 
# Coefficients:
#               Estimate Std. Error z value Pr(>|z|)    
# (Intercept) -2.6523837  0.1674387 -15.841   <2e-16 ***
# Var1         0.0002429  0.0035759   0.068    0.946    
# ---
# Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 
# (Dispersion parameter for binomial family taken to be 1)
# 
#     Null deviance: 302.09  on 620  degrees of freedom
# Residual deviance: 302.08  on 619  degrees of freedom
#   (44407 observations deleted due to missingness)                  	# Note: 1 
# AIC: 306.08
# 
# Number of Fisher Scoring iterations: 5

dim(dTrainAll)
## [1] 45025   231
# [1] 45028   234

# Note 1: 
#   This means the modeling procedure threw out this much (almost all) of our training data. 
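A quick hedged check (not from the book's listing) of why so many rows were dropped: nearly all training rows are missing Var1, and glm()'s default na.action removes any row with an NA in a model variable.

# Hedged check: count the training rows that actually have a Var1 value.
sum(!is.na(dTrainAll$Var1))          # only a few hundred of the roughly 45,000 rows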

00254_informalexample_8.3_of_section_8.2.2.R

# informalexample 8.3 of section 8.2.2 
# (informalexample 8.3 of section 8.2.2)  : Advanced data preparation : KDD and KDD Cup 2009 : The bull in the china shop approach 

head(dTrainAll$Var200)
## [1] <NA>    <NA>    vynJTq9 <NA>    0v21jmy <NA>   
## 15415 Levels: _84etK_ _9bTOWp _A3VKFm _bq4Nkb _ct4nkXBMp ... zzQ9udm
# [1] <NA>    <NA>    vynJTq9 <NA>    0v21jmy <NA>   
# 15415 Levels: _84etK_ _9bTOWp _A3VKFm _bq4Nkb _ct4nkXBMp ... zzQ9udm  

length(unique(dTrainAll$Var200))
## [1] 14418
# [1] 14391

00255_example_8.4_of_section_8.3.R

# example 8.4 of section 8.3 
# (example 8.4 of section 8.3)  : Advanced data preparation : Basic data preparation for classification 
# Title: Basic data preparation for classification 

library("vtreat")                                               	# Note: 1 

(parallel_cluster <- parallel::makeCluster(parallel::detectCores()))  	# Note: 2 
## socket cluster with 4 nodes on host 'localhost'
treatment_plan <- vtreat::designTreatmentsC(                        	# Note: 3 
  dTrain, 
  varlist = vars, 
  outcomename = "churn", 
  outcometarget = 1, 
  verbose = FALSE,
  parallelCluster = parallel_cluster)

# Note 1: 
#   Attach the vtreat package for functions such as designTreatmentsC(). 

# Note 2: 
#   Start up a parallel cluster to speed up 
#   calculation. If you don’t want a parallel cluster, just set 
#   parallel_cluster to NULL. 

# Note 3: 
#   Use designTreatmentsC() to learn the treatment plan from the training data. For a dataset of the size and complexity of KDD2009, this can take a few minutes. 
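Once all of the prepare() calls that use the cluster are finished, the workers can be released. A minimal hedged sketch (not part of the book listing shown here):

# Shut down the parallel workers when no longer needed (skip if parallel_cluster is NULL).
if (!is.null(parallel_cluster)) {
  parallel::stopCluster(parallel_cluster)
  parallel_cluster <- NULL
}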

00256_example_8.5_of_section_8.3.R

# example 8.5 of section 8.3 
# (example 8.5 of section 8.3)  : Advanced data preparation : Basic data preparation for classification 
# Title: Preparing data with vtreat 

dTrain_treated <- prepare(treatment_plan, 
                          dTrain,
                          parallelCluster = parallel_cluster)
                        
head(colnames(dTrain))
## [1] "Var1" "Var2" "Var3" "Var4" "Var5" "Var6"
## [1] "Var1" "Var2" "Var3" "Var4" "Var5" "Var6"
head(colnames(dTrain_treated))                          	# Note: 1 
## [1] "Var1"       "Var1_isBAD" "Var2"       "Var2_isBAD" "Var3"      
## [6] "Var3_isBAD"
## [1] "Var1"       "Var1_isBAD" "Var2"       "Var2_isBAD" "Var3"      
## [6] "Var3_isBAD"

# Note 1: 
#   Compare the columns of the original dTrain data to its treated counterpart. 

00257_informalexample_8.4_of_section_8.3.1.R

# informalexample 8.4 of section 8.3.1 
# (informalexample 8.4 of section 8.3.1)  : Advanced data preparation : Basic data preparation for classification : The variable score frame 

score_frame <-  treatment_plan$scoreFrame
t(subset(score_frame, origName %in% c("Var126", "Var189")))
##                   225            226            341           
## varName           "Var126"       "Var126_isBAD" "Var189"      
## varMoves          "TRUE"         "TRUE"         "TRUE"        
## rsq               "0.0030859179" "0.0136377093" "0.0118934515"
## sig               "7.876602e-16" "2.453679e-64" "2.427376e-56"
## needsSplit        "FALSE"        "FALSE"        "FALSE"       
## extraModelDegrees "0"            "0"            "0"           
## origName          "Var126"       "Var126"       "Var189"      
## code              "clean"        "isBAD"        "clean"       
##                   342           
## varName           "Var189_isBAD"
## varMoves          "TRUE"        
## rsq               "0.0001004614"
## sig               "1.460688e-01"
## needsSplit        "FALSE"       
## extraModelDegrees "0"           
## origName          "Var189"      
## code              "isBAD"
#                   225            226            341            342           
# varName           "Var126"       "Var126_isBAD" "Var189"       "Var189_isBAD"  	# Note: 1 
# varMoves          "TRUE"         "TRUE"         "TRUE"         "TRUE"        	# Note: 2 
# rsq               "0.0030859179" "0.0136377093" "0.0118934515" "0.0001004614" 	# Note: 3 
# sig               "7.876602e-16" "2.453679e-64" "2.427376e-56" "1.460688e-01" 	# Note: 4 
# needsSplit        "FALSE"        "FALSE"        "FALSE"        "FALSE"       	# Note: 5 
# extraModelDegrees "0"            "0"            "0"            "0"           	# Note: 6 
# origName          "Var126"       "Var126"       "Var189"       "Var189"      	# Note: 7 
# code              "clean"        "isBAD"        "clean"        "isBAD"          	# Note: 8

# Note 1: 
#   The name of the derived variable or column. 

# Note 2: 
#   An indicator that this variable is not always the same value (not a constant, which would be useless for modeling). 

# Note 3: 
#   The R-squared or pseudo R-squared of the 
#   variable, what fraction of the outcome variation 
#   this variable can explain on its own in a linear 
#   model. 

# Note 4: 
#   The significance of the estimated 
#   R-squared. 

# Note 5: 
#   An indicator that, when TRUE, warns the user that the variable is 
#   hiding extra degrees of freedom (a measure of model complexity) and needs to be evaluated using cross-validation techniques. 

# Note 6: 
#   How complex the variable is; for a categorical variable, this is related to the number of levels. 

# Note 7: 
#   Name of the original column the variable was derived from. 

# Note 8: 
#   Name of the type of transformation used to build this variable. 
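As a hedged sketch of how such a pseudo R-squared can be computed (not from the book, and vtreat's internal calculation may differ in detail): fit a one-variable logistic regression and compare its deviance to the null deviance.

# Deviance-based pseudo R-squared for a single treated variable (hedged sketch).
pseudo_r2 <- function(x, y) {
  fit <- glm(y ~ x, family = binomial)
  1 - fit$deviance / fit$null.deviance
}
pseudo_r2(dTrain_treated$Var126, dTrain_treated$churn == 1)   # compare to score_frame's rsq for Var126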

00258_informalexample_8.5_of_section_8.3.1.R

# informalexample 8.5 of section 8.3.1 
# (informalexample 8.5 of section 8.3.1)  : Advanced data preparation : Basic data preparation for classification : The variable score frame 

t(subset(score_frame, origName == "Var218"))
##                   389            390            488                
## varName           "Var218_catP"  "Var218_catB"  "Var218_lev_x_cJvF"
## varMoves          "TRUE"         "TRUE"         "TRUE"             
## rsq               "0.010955895"  "0.012709736"  "0.005295590"      
## sig               "4.836496e-52" "4.394981e-60" "4.902238e-26"     
## needsSplit        "TRUE"         "TRUE"         "FALSE"            
## extraModelDegrees "2"            "2"            "0"                
## origName          "Var218"       "Var218"       "Var218"           
## code              "catP"         "catB"         "lev"              
##                   489                
## varName           "Var218_lev_x_UYBR"
## varMoves          "TRUE"             
## rsq               "0.001970131"      
## sig               "1.218959e-10"     
## needsSplit        "FALSE"            
## extraModelDegrees "0"                
## origName          "Var218"           
## code              "lev"
#                   389            390            488                 489                
# varName           "Var218_catP"  "Var218_catB"  "Var218_lev_x_cJvF" "Var218_lev_x_UYBR"
# varMoves          "TRUE"         "TRUE"         "TRUE"              "TRUE"             
# rsq               "0.011014574"  "0.012245152"  "0.005295590"       "0.001970131"      
# sig               "2.602574e-52" "5.924945e-58" "4.902238e-26"      "1.218959e-10"     
# needsSplit        " TRUE"        " TRUE"        "FALSE"             "FALSE"            
# extraModelDegrees "2"            "2"            "0"                 "0"                
# origName          "Var218"       "Var218"       "Var218"            "Var218"           
# code              "catP"         "catB"         "lev"               "lev"

00259_informalexample_8.6_of_section_8.3.1.R

# informalexample 8.6 of section 8.3.1 
# (informalexample 8.6 of section 8.3.1)  : Advanced data preparation : Basic data preparation for classification : The variable score frame 

comparison <- data.frame(original218 = dTrain$Var218,
                         impact218 = dTrain_treated$Var218_catB) 

head(comparison)
##   original218  impact218
## 1        cJvF -0.2180735
## 2        <NA>  1.5155125
## 3        UYBR  0.1221393
## 4        UYBR  0.1221393
## 5        UYBR  0.1221393
## 6        UYBR  0.1221393
 ##   original218  impact218
## 1        cJvF -0.2180735
## 2        <NA>  1.5155125
## 3        UYBR  0.1221393
## 4        UYBR  0.1221393
## 5        UYBR  0.1221393
## 6        UYBR  0.1221393

00260_informalexample_8.7_of_section_8.3.1.R

# informalexample 8.7 of section 8.3.1 
# (informalexample 8.7 of section 8.3.1)  : Advanced data preparation : Basic data preparation for classification : The variable score frame 

treatment_plan_2 <- design_missingness_treatment(dTrain, varlist = vars) 	# Note: 1 
dtrain_2 <- prepare(treatment_plan_2, dTrain)                           	# Note: 2 
head(dtrain_2$Var218)
## [1] "cJvF"      "_invalid_" "UYBR"      "UYBR"      "UYBR"      "UYBR"
## [1] "cJvF"      "_invalid_" "UYBR"      "UYBR"      "UYBR"      "UYBR"

model <- glm(churn ==1  ~ Var218,         	# Note: 3 
            data = dtrain_2, 
            family = "binomial")
            
pred <- predict(model,                 	# Note: 4 
               newdata = dtrain_2, 
               type = "response")  
               
(prevalence <- mean(dTrain$churn == 1) )  	# Note: 5 
## [1] 0.07347764
## [1] 0.07347764

logit <- function(p) {                	# Note: 6 
  log (p / (1-p) )
}

comparison$glm218 <- logit(pred) - logit(prevalence)  	# Note: 7 
head(comparison)
##   original218  impact218     glm218
## 1        cJvF -0.2180735 -0.2180735
## 2        <NA>  1.5155125  1.5155121
## 3        UYBR  0.1221393  0.1221392
## 4        UYBR  0.1221393  0.1221392
## 5        UYBR  0.1221393  0.1221392
## 6        UYBR  0.1221393  0.1221392
##   original218  impact218     glm218
## 1        cJvF -0.2180735 -0.2180735  	# Note: 8 
## 2        <NA>  1.5155125  1.5155121
## 3        UYBR  0.1221393  0.1221392
## 4        UYBR  0.1221393  0.1221392
## 5        UYBR  0.1221393  0.1221392
## 6        UYBR  0.1221393  0.1221392

# Note 1: 
#   Simple treatment to turn NA into a safe string. 

# Note 2: 
#   Create the treated data. 

# Note 3: 
#   Fit the one-variable logistic regression model. 

# Note 4: 
#   Make predictions on the data. 

# Note 5: 
#   Calculate the global probability of churn. 

# Note 6: 
#   A function to calculate the logit, or log-odds of a probability. 

# Note 7: 
#   Calculate the catB values by hand. 

# Note 8: 
#   Notice the impact codes 
#   from vtreat match the “delta logit” encoded 
#   predictions from the standard glm model. This helps 
#   illustrate how vtreat is implemented. 

00261_informalexample_8.8_of_section_8.3.1.R

# informalexample 8.8 of section 8.3.1 
# (informalexample 8.8 of section 8.3.1)  : Advanced data preparation : Basic data preparation for classification : The variable score frame 

score_frame[score_frame$origName == "Var200", , drop = FALSE]
##           varName varMoves         rsq          sig needsSplit
## 361   Var200_catP     TRUE 0.005729292 4.930851e-28       TRUE
## 362   Var200_catB     TRUE 0.000945847 8.192687e-06       TRUE
## 428 Var200_lev_NA     TRUE 0.005729838 4.902365e-28      FALSE
##     extraModelDegrees origName code
## 361             13323   Var200 catP
## 362             13323   Var200 catB
## 428                 0   Var200  lev
#           varName varMoves         rsq          sig needsSplit extraModelDegrees origName code
# 361   Var200_catP     TRUE 0.005729835 4.902546e-28       TRUE             13323   Var200 catP
# 362   Var200_catB     TRUE 0.001476298 2.516703e-08       TRUE             13323   Var200 catB
# 428 Var200_lev_NA     TRUE 0.005729838 4.902365e-28      FALSE                 0   Var200  lev

00262_informalexample_8.9_of_section_8.3.2.R

# informalexample 8.9 of section 8.3.2 
# (informalexample 8.9 of section 8.3.2)  : Advanced data preparation : Basic data preparation for classification : Properly using the treatment plan 

dCal_treated <- prepare(treatment_plan, 
                        dCal,
                        parallelCluster = parallel_cluster)

00263_informalexample_8.10_of_section_8.3.2.R

# informalexample 8.10 of section 8.3.2 
# (informalexample 8.10 of section 8.3.2)  : Advanced data preparation : Basic data preparation for classification : Properly using the treatment plan 

library("sigr")

calcAUC(dTrain_treated$Var200_catB, dTrain_treated$churn)
## [1] 0.8279249
# [1] 0.8279249

calcAUC(dCal_treated$Var200_catB, dCal_treated$churn)
## [1] 0.5505401
# [1] 0.5505401

00264_example_8.6_of_section_8.4.1.R

# example 8.6 of section 8.4.1 
# (example 8.6 of section 8.4.1)  : Advanced data preparation : Advanced data preparation for classification : Using mkCrossFrameCExperiment() 
# Title: Advanced data preparation for classification 

library("vtreat")

parallel_cluster <- parallel::makeCluster(parallel::detectCores())

cross_frame_experiment <- vtreat::mkCrossFrameCExperiment(
  dTrainAll, 
  varlist = vars, 
  outcomename = "churn", 
  outcometarget = 1, 
  verbose = FALSE,
  parallelCluster = parallel_cluster)

dTrainAll_treated <- cross_frame_experiment$crossFrame 	# Note: 1 
treatment_plan <- cross_frame_experiment$treatments
score_frame <- treatment_plan$scoreFrame

dTest_treated <- prepare(treatment_plan, 	# Note: 2 
                         dTest,
                         parallelCluster = parallel_cluster)

# Note 1: 
#   We will use the cross-frame to train the logistic regression model. 

# Note 2: 
#   Prepare the test set so we can call the model on it. 

00265_informalexample_8.11_of_section_8.4.1.R

# informalexample 8.11 of section 8.4.1 
# (informalexample 8.11 of section 8.4.1)  : Advanced data preparation : Advanced data preparation for classification : Using mkCrossFrameCExperiment() 

library("sigr")

calcAUC(dTrainAll_treated$Var200_catB, dTrainAll_treated$churn)
## [1] 0.5496616
# [1] 0.5450466

calcAUC(dTest_treated$Var200_catB, dTest_treated$churn)
## [1] 0.5290295
# [1] 0.5290295

00266_informalexample_8.12_of_section_8.4.2.R

# informalexample 8.12 of section 8.4.2 
# (informalexample 8.12 of section 8.4.2)  : Advanced data preparation : Advanced data preparation for classification : Building a model 

k <- 1       	# Note: 1 
(significance_cutoff <- k / nrow(score_frame))
## [1] 0.001831502
# [1] 0.001831502
score_frame$selected <- score_frame$sig < significance_cutoff
                                

suppressPackageStartupMessages(library("dplyr")) 	# Note: 2 

score_frame %>%
  group_by(., code, selected) %>%
  summarize(., 
            count = n()) %>%
  ungroup(.) %>%
  cdata::pivot_to_rowrecs(., 
                          columnToTakeKeysFrom = 'selected',
                          columnToTakeValuesFrom = 'count',
                          rowKeyColumns = 'code',
                          sep = '=') 
## # A tibble: 5 x 3
##   code  `selected=FALSE` `selected=TRUE`
##   <chr>            <int>           <int>
## 1 catB                11              22
## 2 catP                 7              26
## 3 clean              158              15
## 4 isBAD               60             111
## 5 lev                 74              62
# # A tibble: 5 x 3
#   code  `selected=FALSE` `selected=TRUE`
#   <chr>            <int>           <int>
# 1 catB                12              21
# 2 catP                 7              26
# 3 clean              158              15
# 4 isBAD               60             111
# 5 lev                 74              62

# Note 1: 
#   Filter variables on significance, using the k / nrow(score_frame) heuristic with k = 1. 

# Note 2: 
#   Bring in the dplyr package to help summarize the selections. 
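A hedged way to read this heuristic (not from the book): if every candidate variable were pure noise, the expected number passing a significance filter at level k / nrow(score_frame) is about k.

# Back-of-envelope check of the heuristic: expected count of noise variables passing the filter.
nrow(score_frame) * significance_cutoff   # equals k, i.e. about one false alarm expected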

00267_example_8.7_of_section_8.4.2.R

# example 8.7 of section 8.4.2 
# (example 8.7 of section 8.4.2)  : Advanced data preparation : Advanced data preparation for classification : Building a model 
# Title: Basic variable re-coding and selection 

library("wrapr")
## 
## Attaching package: 'wrapr'
## The following object is masked from 'package:dplyr':
## 
##     coalesce
newvars <- score_frame$varName[score_frame$selected]

f <- mk_formula("churn", newvars, outcome_target = 1)       	# Note: 1 
model <- glm(f, data = dTrainAll_treated, family = binomial)    	# Note: 2 
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Warning message:                                              	# Note: 3 
# glm.fit: fitted probabilities numerically 0 or 1 occurred

# Note 1: 
#   Build a formula specifying modeling churn == 1 as a function of all variables. 

# Note 2: 
#   Use the modeling formula with R’s glm() function. 

# Note 3: 
#   Take heed of this warning: it is hinting we should move on to a regularized method such as 
#   glmnet. 
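The warning above suggests moving to a regularized fit. A hedged sketch (not from the book) of what that could look like with the glmnet package, reusing the treated frames and selected variables from the preceding examples:

# Hedged sketch: cross-validated L1-regularized logistic regression on the treated data.
library("glmnet")
x_train <- as.matrix(dTrainAll_treated[, newvars])
y_train <- as.numeric(dTrainAll_treated$churn == 1)
cv_fit <- cv.glmnet(x_train, y_train, family = "binomial")
test_pred <- predict(cv_fit,
                     newx = as.matrix(dTest_treated[, newvars]),
                     type = "response", s = "lambda.min")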

00268_informalexample_8.13_of_section_8.4.2.R

# informalexample 8.13 of section 8.4.2 
# (informalexample 8.13 of section 8.4.2)  : Advanced data preparation : Advanced data preparation for classification : Building a model 

library("sigr")

dTest_treated$glm_pred <- predict(model,                                   	# Note: 1 
                                  newdata = dTest_treated, 
                                  type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
# Warning message:                                                         	# Note: 2 
# In predict.lm(object, newdata, se.fit, scale = 1, type = ifelse(type ==  :
#   prediction from a rank-deficient fit may be misleading

calcAUC(dTest_treated$glm_pred, dTest_treated$churn == 1)                	# Note: 3 
## [1] 0.7231226
## [1] 0.7232192

permTestAUC(dTest_treated, "glm_pred", "churn", yTarget = 1)     	# Note: 4 
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.7231, s.d.=0.01626, p<1e-05)."
## [1] "AUC test alt. hyp. AUC>AUC(permuted): (AUC=0.7232, s.d.=0.01535, p<1e-05)."
       
var_aucs <- vapply(newvars,                                              	# Note: 5 
       function(vi) {
         calcAUC(dTrainAll_treated[[vi]], dTrainAll_treated$churn == 1)
       }, numeric(1))
(best_train_aucs <- var_aucs[var_aucs >= max(var_aucs)])
## Var216_catB 
##   0.5892631
## Var216_catB 
##   0.5873512

# Note 1: 
#   Add the model prediction to the evaluation data as a new column. 

# Note 2: 
#   Again, take heed of this warning: it is hinting we should 
#   move on to a regularized method such as 
#   glmnet. 

# Note 3: 
#   Calculate the AUC of the model on hold-out data. 

# Note 4: 
#   Calculate the AUC a second time, using an alternative method that also estimates a standard deviation or error-bar. 

# Note 5: 
#   Here we calculate the best single variable model AUC for comparison. 

00269_informalexample_8.14_of_section_8.4.2.R

# informalexample 8.14 of section 8.4.2 
# (informalexample 8.14 of section 8.4.2)  : Advanced data preparation : Advanced data preparation for classification : Building a model 

table(prediction = dTest_treated$glm_pred >= 0.5, 
      truth = dTest$churn)
##           truth
## prediction   -1    1
##      FALSE 4591  375
##      TRUE     8    1
#           truth
# prediction   -1    1
#      FALSE 4591  375
#      TRUE     8    1

00270_informalexample_8.15_of_section_8.4.2.R

# informalexample 8.15 of section 8.4.2 
# (informalexample 8.15 of section 8.4.2)  : Advanced data preparation : Advanced data preparation for classification : Building a model 

table(prediction = dTest_treated$glm_pred>0.15, 
      truth = dTest$churn)
##           truth
## prediction   -1    1
##      FALSE 4247  266
##      TRUE   352  110
#           truth
# prediction   -1    1
#      FALSE 4243  266
#      TRUE   356  110
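A hedged back-of-envelope reading of the 0.15-threshold table above (not from the book): the classifier's precision is several times the base churn rate, and that ratio is the "enrichment" plotted in the next example.

# Arithmetic on the counts shown above for the 0.15 threshold (hedged).
precision <- 110 / (356 + 110)       # about 0.24
prevalence <- 0.07347764             # observed churn rate from earlier
precision / prevalence               # enrichment of roughly 3.2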

00271_informalexample_8.16_of_section_8.4.2.R

# informalexample 8.16 of section 8.4.2 
# (informalexample 8.16 of section 8.4.2)  : Advanced data preparation : Advanced data preparation for classification : Building a model 

WVPlots::DoubleDensityPlot(dTest_treated, "glm_pred", "churn", 
                           "glm prediction on test, double density plot")

(Figure: glm prediction on test, double density plot)

WVPlots::PRTPlot(dTest_treated, "glm_pred", "churn", 
                 "glm prediction on test, enrichment plot",
                 truthTarget = 1,
                 plotvars = c("enrichment", "recall"),
                 thresholdrange = c(0, 1.0))

(Figure: glm prediction on test, enrichment plot)

00272_informalexample_8.17_of_section_8.5.R

# informalexample 8.17 of section 8.5 
# (informalexample 8.17 of section 8.5)  : Advanced data preparation : Preparing data for regression modeling 

auto_mpg <- readRDS('../auto_mpg/auto_mpg.RDS')

knitr::kable(head(auto_mpg)) 	# Note: 1
| mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | car_name                    |
|----:|----------:|-------------:|-----------:|-------:|-------------:|-----------:|:-------|:----------------------------|
|  18 |         8 |          307 |        130 |   3504 |         12.0 |         70 | 1      | "chevrolet chevelle malibu" |
|  15 |         8 |          350 |        165 |   3693 |         11.5 |         70 | 1      | "buick skylark 320"         |
|  18 |         8 |          318 |        150 |   3436 |         11.0 |         70 | 1      | "plymouth satellite"        |
|  16 |         8 |          304 |        150 |   3433 |         12.0 |         70 | 1      | "amc rebel sst"             |
|  17 |         8 |          302 |        140 |   3449 |         10.5 |         70 | 1      | "ford torino"               |
|  15 |         8 |          429 |        198 |   4341 |         10.0 |         70 | 1      | "ford galaxie 500"          |
# Note 1: 
#   Take a quick look at the data. 

00273_informalexample_8.18_of_section_8.5.R

# informalexample 8.18 of section 8.5 
# (informalexample 8.18 of section 8.5)  : Advanced data preparation : Preparing data for regression modeling 

library("wrapr")

vars <- c("cylinders", "displacement",                       	# Note: 1 
          "horsepower", "weight", "acceleration",
          "model_year", "origin")
f <- mk_formula("mpg", vars)
model <- lm(f, data = auto_mpg)

auto_mpg$prediction <- predict(model, newdata = auto_mpg)            	# Note: 2 

str(auto_mpg[!complete.cases(auto_mpg), , drop = FALSE])
## 'data.frame':	6 obs. of  10 variables:
##  $ mpg         : num  25 21 40.9 23.6 34.5 23
##  $ cylinders   : num  4 6 4 4 4 4
##  $ displacement: num  98 200 85 140 100 151
##  $ horsepower  : num  NA NA NA NA NA NA
##  $ weight      : num  2046 2875 1835 2905 2320 ...
##  $ acceleration: num  19 17 17.3 14.3 15.8 20.5
##  $ model_year  : num  71 74 80 80 81 82
##  $ origin      : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
##  $ car_name    : chr  "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" "\"ford mustang cobra\"" ...
##  $ prediction  : num  NA NA NA NA NA NA
# 'data.frame':    6 obs. of  10 variables:
#  $ mpg         : num  25 21 40.9 23.6 34.5 23
#  $ cylinders   : num  4 6 4 4 4 4
#  $ displacement: num  98 200 85 140 100 151
#  $ horsepower  : num  NA NA NA NA NA NA                            	# Note: 3 
#  $ weight      : num  2046 2875 1835 2905 2320 ...
#  $ acceleration: num  19 17 17.3 14.3 15.8 20.5
#  $ model_year  : num  71 74 80 80 81 82
#  $ origin      : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
#  $ car_name    : chr  "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" ...
#  $ prediction  : num  NA NA NA NA NA NA                            	# Note: 4

# Note 1: 
#   Jump into modeling without bothering to treat the data. 

# Note 2: 
#   Add the model predictions as a new column. 

# Note 3: 
#   Notice these cars do not have a recorded horsepower. 

# Note 4: 
#   So these cars do not get a prediction. 
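A quick hedged check (not from the book's listing) connecting these notes: lm()'s default na.action silently drops the rows with missing horsepower, and predict() then returns NA for them.

# Hedged check: the incomplete rows are exactly the ones with missing horsepower.
sum(is.na(auto_mpg$horsepower))      # the 6 rows shown above
sum(is.na(auto_mpg$prediction))      # the same rows receive NA predictions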

00274_informalexample_8.19_of_section_8.5.R

# informalexample 8.19 of section 8.5 
# (informalexample 8.19 of section 8.5)  : Advanced data preparation : Preparing data for regression modeling 

library("vtreat")

cfe <- mkCrossFrameNExperiment(auto_mpg, vars, "mpg",           	# Note: 1 
                               verbose = FALSE)
treatment_plan <- cfe$treatments
auto_mpg_treated <- cfe$crossFrame
score_frame <- treatment_plan$scoreFrame
new_vars <- score_frame$varName

newf <- mk_formula("mpg", new_vars)
new_model <- lm(newf, data = auto_mpg_treated)

auto_mpg$prediction <- predict(new_model, newdata = auto_mpg_treated)
## Warning in predict.lm(new_model, newdata = auto_mpg_treated): prediction
## from a rank-deficient fit may be misleading
# Warning in predict.lm(new_model, newdata = auto_mpg_treated): prediction
# from a rank-deficient fit may be misleading
str(auto_mpg[!complete.cases(auto_mpg), , drop = FALSE])
## 'data.frame':	6 obs. of  10 variables:
##  $ mpg         : num  25 21 40.9 23.6 34.5 23
##  $ cylinders   : num  4 6 4 4 4 4
##  $ displacement: num  98 200 85 140 100 151
##  $ horsepower  : num  NA NA NA NA NA NA
##  $ weight      : num  2046 2875 1835 2905 2320 ...
##  $ acceleration: num  19 17 17.3 14.3 15.8 20.5
##  $ model_year  : num  71 74 80 80 81 82
##  $ origin      : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
##  $ car_name    : chr  "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" "\"ford mustang cobra\"" ...
##  $ prediction  : num  24.5 22.5 35 26 32.6 ...
# 'data.frame':    6 obs. of  10 variables:
#  $ mpg         : num  25 21 40.9 23.6 34.5 23
#  $ cylinders   : num  4 6 4 4 4 4
#  $ displacement: num  98 200 85 140 100 151
#  $ horsepower  : num  NA NA NA NA NA NA
#  $ weight      : num  2046 2875 1835 2905 2320 ...
#  $ acceleration: num  19 17 17.3 14.3 15.8 20.5
#  $ model_year  : num  71 74 80 80 81 82
#  $ origin      : Factor w/ 3 levels "1","2","3": 1 1 2 1 2 1
#  $ car_name    : chr  "\"ford pinto\"" "\"ford maverick\"" "\"renault lecar deluxe\"" ...
#  $ prediction  : num  24.6 22.4 34.2 26.1 33.3 ...               	# Note: 2

# Note 1: 
#   Try it again with vtreat data preparation. 

# Note 2: 
#   Now we can make predictions, even for items that have missing data. 
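A hedged peek (not from the book) at how vtreat makes that possible: the treated frame carries an imputed horsepower column plus an indicator, assuming the _isBAD naming convention seen earlier for the KDD data.

# Column names assumed to follow vtreat's <origName>_isBAD convention.
head(auto_mpg_treated[, c("horsepower", "horsepower_isBAD")])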

00275_informalexample_8.20_of_section_8.6.2.R

# informalexample 8.20 of section 8.6.2 
# (informalexample 8.20 of section 8.6.2)  : Advanced data preparation : Mastering the vtreat package : Missing values 

library("wrapr")                       	# Note: 1 

d <- build_frame(
   "x1"    , "x2"         , "x3", "y" |
   1       , "a"          , 6   , 10  |
   NA_real_, "b"          , 7   , 20  |
   3       , NA_character_, 8   , 30  )

knitr::kable(d)
| x1 | x2 | x3 |  y |
|---:|:---|---:|---:|
|  1 | a  |  6 | 10 |
| NA | b  |  7 | 20 |
|  3 | NA |  8 | 30 |
# Note 1: 
#   Bring in the wrapr package for build_frame and the 
#   wrapr “dot pipe”. 

00276_informalexample_8.21_of_section_8.6.2.R

# informalexample 8.21 of section 8.6.2 
# (informalexample 8.21 of section 8.6.2)  : Advanced data preparation : Mastering the vtreat package : Missing values 

plan1 <- vtreat::design_missingness_treatment(d)
vtreat::prepare(plan1, d) %.>%            	# Note: 1 
   knitr::kable(.)
| x1 | x1_isBAD | x2        | x3 |  y |
|---:|---------:|:----------|---:|---:|
|  1 |        0 | a         |  6 | 10 |
|  2 |        1 | b         |  7 | 20 |
|  3 |        0 | _invalid_ |  8 | 30 |
# Note 1: 
#   Here we are using wrapr’s dot pipe instead of 
#   magrittr’s forward pipe. The dot pipe requires the 
#   “explicit dot argument” notation discussed in 
#   chapter 5. 
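A tiny hedged illustration of the explicit-dot rule (not from the book): with wrapr's dot pipe, the incoming value must be referred to as "." in the right-hand expression.

# wrapr dot pipe: the dot argument is written explicitly.
library("wrapr")
5 %.>% sin(.)     # equivalent to sin(5)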

00277_informalexample_8.22_of_section_8.6.3.R

# informalexample 8.22 of section 8.6.3 
# (informalexample 8.22 of section 8.6.3)  : Advanced data preparation : Mastering the vtreat package : Indicator variables 

d <- build_frame(
   "x1"    , "x2"         , "x3", "y" |
   1       , "a"          , 6   , 10  |
   NA_real_, "b"          , 7   , 20  |
   3       , NA_character_, 8   , 30  )

print(d)
##   x1   x2 x3  y
## 1  1    a  6 10
## 2 NA    b  7 20
## 3  3 <NA>  8 30
#   x1   x2 x3  y
# 1  1    a  6 10
# 2 NA    b  7 20                                                 	# Note: 1 
# 3  3 <NA>  8 30
plan2 <- vtreat::designTreatmentsZ(d, 
                                   varlist = c("x1", "x2", "x3"),
                                   verbose = FALSE)
vtreat::prepare(plan2, d)
##   x1 x1_isBAD x3 x2_lev_NA x2_lev_x_a x2_lev_x_b
## 1  1        0  6         0          1          0
## 2  2        1  7         0          0          1
## 3  3        0  8         1          0          0
#   x1 x1_isBAD x3 x2_lev_NA x2_lev_x_a x2_lev_x_b
# 1  1        0  6         0          1          0
# 2  2        1  7         0          0          1                  	# Note: 2 
# 3  3        0  8         1          0          0

# Note 1: 
#   The second value of x2 is “b” 

# Note 2: 
#   In the second row of the treated data, x2_lev_x_b = 1 
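As a hedged aside (not from the book), each lev column is just an explicit 0/1 indicator; the same values can be computed directly from x2, with NA treated as its own level.

# By-hand version of the x2 indicator columns produced above (hedged sketch).
data.frame(
  x2             = d$x2,
  lev_NA_by_hand = as.numeric(is.na(d$x2)),
  lev_a_by_hand  = as.numeric(!is.na(d$x2) & d$x2 == "a"),
  lev_b_by_hand  = as.numeric(!is.na(d$x2) & d$x2 == "b"))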

00278_informalexample_8.23_of_section_8.6.4.R

# informalexample 8.23 of section 8.6.4 
# (informalexample 8.23 of section 8.6.4)  : Advanced data preparation : Mastering the vtreat package : Impact coding 

d <- build_frame(
   "x1"    , "x2"         , "x3", "y" |
   1       , "a"          , 6   , 10  |
   NA_real_, "b"          , 7   , 20  |
   3       , NA_character_, 8   , 30  )

print(d)
##   x1   x2 x3  y
## 1  1    a  6 10
## 2 NA    b  7 20
## 3  3 <NA>  8 30
#   x1   x2 x3  y
# 1  1    a  6 10
# 2 NA    b  7 20
# 3  3 <NA>  8 30
plan3 <- vtreat::designTreatmentsN(d, 
                                   varlist = c("x1", "x2", "x3"),
                                   outcomename = "y",
                                   codeRestriction = "catN",
                                   verbose = FALSE)
vtreat::prepare(plan3, d)
##   x2_catN  y
## 1     -10 10
## 2       0 20
## 3      10 30
#   x2_catN  y
# 1     -10 10
# 2       0 20
# 3      10 30
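As a hedged sketch of what the catN impact code represents (not from the book): on this tiny example, fit without cross-validation, each catN value equals the per-level mean of y minus the grand mean of y, with NA acting as its own level.

# By-hand catN values for x2 (hedged sketch; matches the -10, 0, 10 shown above).
grand_mean <- mean(d$y)                        # 20
level_means <- tapply(d$y, addNA(d$x2), mean)  # a: 10, b: 20, NA: 30
level_means - grand_mean                       # -10, 0, 10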

00279_informalexample_8.24_of_section_8.6.4.R

# informalexample 8.24 of section 8.6.4 
# (informalexample 8.24 of section 8.6.4)  : Advanced data preparation : Mastering the vtreat package : Impact coding 

plan4 <- vtreat::designTreatmentsC(d, 
                                   varlist = c("x1", "x2", "x3"),
                                   outcomename = "y",
                                   outcometarget = 20,
                                   codeRestriction = "catB",
                                   verbose = FALSE)
vtreat::prepare(plan4, d)
##     x2_catB  y
## 1 -8.517343 10
## 2  9.903538 20
## 3 -8.517343 30
#     x2_catB  y
# 1 -8.517343 10
# 2  9.903538 20
# 3 -8.517343 30

00280_informalexample_8.25_of_section_8.6.5.R

# informalexample 8.25 of section 8.6.5 
# (informalexample 8.25 of section 8.6.5)  : Advanced data preparation : Mastering the vtreat package : The treatment plan 

class(plan4)
## [1] "treatmentplan"
# [1] "treatmentplan"

names(plan4)
## [1] "treatments"    "scoreFrame"    "outcomename"   "vtreatVersion"
## [5] "outcomeType"   "outcomeTarget" "meanY"         "splitmethod"
# [1] "treatments"    "scoreFrame"    "outcomename"   "vtreatVersion" "outcomeType"  
# [6] "outcomeTarget" "meanY"         "splitmethod"

00281_informalexample_8.26_of_section_8.6.5.R

# informalexample 8.26 of section 8.6.5 
# (informalexample 8.26 of section 8.6.5)  : Advanced data preparation : Mastering the vtreat package : The treatment plan 

plan4$scoreFrame
##   varName varMoves rsq       sig needsSplit extraModelDegrees origName
## 1 x2_catB     TRUE   1 0.0506719       TRUE                 2       x2
##   code
## 1 catB
#   varName varMoves rsq       sig needsSplit extraModelDegrees origName code
# 1 x2_catB     TRUE   1 0.0506719       TRUE                 2       x2 catB

00282_example_8.8_of_section_8.6.6.R

# example 8.8 of section 8.6.6 
# (example 8.8 of section 8.6.6)  : Advanced data preparation : Mastering the vtreat package : The cross-frame 
# Title: An information-free dataset 

set.seed(2019)                                               	# Note: 1 

d <- data.frame(                                             	# Note: 2 
  x_bad = sample(letters, 100, replace = TRUE),
  y = rnorm(100),
  stringsAsFactors = FALSE
)
d$x_good <- ifelse(d$y > rnorm(100), "non-neg", "neg") 	# Note: 3 


head(d) 	# Note: 4 
##   x_bad          y  x_good
## 1     y -0.7603575 non-neg
## 2     j  0.4442418 non-neg
## 3     e  1.7386856 non-neg
## 4     m -0.7752029 non-neg
## 5     q -1.1825636     neg
## 6     x -0.3140285 non-neg
#   x_bad           y  x_good
# 1     u -0.05294738 non-neg
# 2     s -0.23639840     neg
# 3     h -0.33796351 non-neg
# 4     q -0.75548467 non-neg
# 5     b -0.86159347     neg
# 6     b -0.52766549 non-neg

# Note 1: 
#   Set pseudo-random number generator seed to make example reproducible. 

# Note 2: 
#   Build example data where there is no relation between x_bad and y. 

# Note 3: 
#   x_good is a noisy prediction of the sign of y, so it does have some information about y. 

# Note 4: 
#   Take a look at our synthetic example data. The idea is: y is related to x_good in a noisy fashion, but unrelated to x_bad.  
#   In this case, we know what variables should be chosen, so we can tell if our acceptance procedure is working correctly. 

00283_example_8.9_of_section_8.6.6.R

# example 8.9 of section 8.6.6 
# (example 8.9 of section 8.6.6)  : Advanced data preparation : Mastering the vtreat package : The cross-frame 
# Title: The dangers of re-using data 

plan5 <- vtreat::designTreatmentsN(d,                      	# Note: 1 
                                   varlist = c("x_bad", "x_good"),
                                   outcomename = "y",
                                   codeRestriction = "catN",
                                   minFraction = 2,
                                   verbose = FALSE)

class(plan5)
## [1] "treatmentplan"
# [1] "treatmentplan"

print(plan5)                                                	# Note: 2 
##   origName     varName code         rsq          sig extraModelDegrees
## 1    x_bad  x_bad_catN catN 9.93962e-05 9.215776e-01                25
## 2   x_good x_good_catN catN 3.25805e-01 5.631959e-10                 1
#   origName     varName code          rsq          sig extraModelDegrees
# 1    x_bad  x_bad_catN catN 4.906903e-05 9.448548e-01                24
# 2   x_good x_good_catN catN 2.602702e-01 5.895285e-08                 1

training_data1 <- vtreat::prepare(plan5, d)                  	# Note: 3 
                                
res1 <- vtreat::patch_columns_into_frame(d, training_data1)    	# Note: 4 
head(res1)
##   x_bad  x_good x_bad_catN x_good_catN          y
## 1     y non-neg  0.2879701    0.523549 -0.7603575
## 2     j non-neg -0.1957772    0.523549  0.4442418
## 3     e non-neg  0.1720338    0.523549  1.7386856
## 4     m non-neg -0.3295562    0.523549 -0.7752029
## 5     q     neg -0.5767783   -0.590385 -1.1825636
## 6     x non-neg -0.2259024    0.523549 -0.3140285
#   x_bad  x_good x_bad_catN x_good_catN           y
# 1     u non-neg  0.4070979   0.4305195 -0.05294738
# 2     s     neg -0.1133011  -0.5706886 -0.23639840
# 3     h non-neg -0.3202346   0.4305195 -0.33796351
# 4     q non-neg -0.5447443   0.4305195 -0.75548467
# 5     b     neg -0.3890076  -0.5706886 -0.86159347
# 6     b non-neg -0.3890076   0.4305195 -0.52766549

sigr::wrapFTest(res1, "x_good_catN", "y")              	# Note: 5 
## [1] "F Test summary: (R2=0.3241, F(1,98)=46.98, p<1e-05)."
# [1] "F Test summary: (R2=0.2717, F(1,98)=36.56, p<1e-05)."

sigr::wrapFTest(res1, "x_bad_catN", "y")               	# Note: 6 
## [1] "F Test summary: (R2=0.1943, F(1,98)=23.64, p<1e-05)."
# [1] "F Test summary: (R2=0.2342, F(1,98)=29.97, p<1e-05)."

# Note 1: 
#   Design a variable treatment plan using x_bad and x_good to try predicting y. 

# Note 2: 
#   Notice the derived variable x_good_catN comes out as having a significant signal, and x_bad_catN does not.  
#   This is due to the proper use of cross-validation in the vtreat quality estimates. 

# Note 3: 
#   Call prepare() on the same data used to design the treatment 
#   plan—this is not always safe, as we shall 
#   see. 

# Note 4: 
#   Combine the data frames d and training_data1, using training_data1 when there are columns with duplicate names. 

# Note 5: 
#   Use a statistical F-test to check the predictive power of x_good_catN. 

# Note 6: 
#   x_bad_catN’s F-test is inflated and falsely looks 
#   significant; this is due to the failure to use cross-validated methods. 
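A hedged way to see the inflation directly (not from the book): generate new data from the same process, prepare it with the same treatment plan, and x_bad_catN's apparent signal disappears.

# New data from the same generating process (x_bad still unrelated to y).
d_new <- data.frame(x_bad = sample(letters, 100, replace = TRUE),
                    y = rnorm(100),
                    stringsAsFactors = FALSE)
d_new$x_good <- ifelse(d_new$y > rnorm(100), "non-neg", "neg")

new_treated <- vtreat::prepare(plan5, d_new)     # apply the previously designed plan
sigr::wrapFTest(new_treated, "x_bad_catN", "y")  # no longer looks significant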

00284_example_8.10_of_section_8.6.6.R

# example 8.10 of section 8.6.6 
# (example 8.10 of section 8.6.6)  : Advanced data preparation : Mastering the vtreat package : The cross-frame 
# Title: Using mkCrossFrameNExperiment() 

cfe <- vtreat::mkCrossFrameNExperiment(d, 
                                       varlist = c("x_bad", "x_good"),
                                       outcomename = "y",
                                       codeRestriction = "catN",
                                       minFraction = 2,
                                       verbose = FALSE)
plan6 <- cfe$treatments

training_data2 <- cfe$crossFrame
res2 <- vtreat::patch_columns_into_frame(d, training_data2)

head(res2)
##   x_bad  x_good x_bad_catN x_good_catN          y
## 1     y non-neg  0.7808611   0.4535235 -0.7603575
## 2     j non-neg -1.4621185   0.4535235  0.4442418
## 3     e non-neg -0.4041591   0.5424189  1.7386856
## 4     m non-neg -0.2240972   0.5772362 -0.7752029
## 5     q     neg -1.1043780  -0.7107558 -1.1825636
## 6     x non-neg  0.0000000   0.5424189 -0.3140285
#   x_bad  x_good x_bad_catN x_good_catN           y
# 1     u non-neg  0.2834739   0.4193180 -0.05294738
# 2     s     neg -0.1085887  -0.6212118 -0.23639840
# 3     h non-neg  0.0000000   0.5095586 -0.33796351
# 4     q non-neg -0.5142570   0.5095586 -0.75548467
# 5     b     neg -0.3540889  -0.6212118 -0.86159347
# 6     b non-neg -0.3540889   0.4193180 -0.52766549

sigr::wrapFTest(res2, "x_bad_catN", "y")
## [1] "F Test summary: (R2=-0.6653, F(1,98)=-39.15, p=n.s.)."
# [1] "F Test summary: (R2=-0.1389, F(1,98)=-11.95, p=n.s.)."

sigr::wrapFTest(res2, "x_good_catN", "y")
## [1] "F Test summary: (R2=0.3039, F(1,98)=42.78, p<1e-05)."
# [1] "F Test summary: (R2=0.2532, F(1,98)=33.22, p<1e-05)."

plan6$scoreFrame                                               	# Note: 1 
##       varName varMoves       rsq          sig needsSplit extraModelDegrees
## 1  x_bad_catN     TRUE 0.0806097 4.201189e-03       TRUE                25
## 2 x_good_catN     TRUE 0.3117822 1.576683e-09       TRUE                 1
##   origName code
## 1    x_bad catN
## 2   x_good catN
#       varName varMoves        rsq          sig needsSplit
# 1  x_bad_catN     TRUE 0.01436145 2.349865e-01       TRUE
# 2 x_good_catN     TRUE 0.26478467 4.332649e-08       TRUE
#   extraModelDegrees origName code
# 1                24    x_bad catN
# 2                 1   x_good catN

# Note 1: 
#   The F-tests on the data and the scoreFrame statistics now largely agree.