Merge pull request #154 from Chicago/dev

Promoting final model to coincide with technical paper
Chicago · Jan 10, 2018 · 1005bd5 · 1005bd5
2 parents 455e85c + 19a18d5
commit 1005bd5
Show file tree

Hide file tree

Showing 9 changed files with 192 additions and 315 deletions.
diff --git a/Data/df.Rds b/Data/df.Rds
diff --git a/Functions/modelEcoli.R b/Functions/modelEcoli.R
@@ -5,54 +5,37 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod
   test_vars <- ncol(testData)
   model <- randomForest(Escherichia.coli ~ .,
                         data = trainData[,
-                                         c(1:(train_vars - 1))])
+                                         c(1:(train_vars - 2))],
+                        ntree = 1000,
+                        importance = TRUE,
+                        proxmity = TRUE)
+                        # maxnodes = 20)
   testData$predictionRF <- predict(model, testData[,c(1:(test_vars-2))])
   tp <- c()
   fn <- c()
   tn <- c()
   fp <- c()
-  tpUSGS <- c()
-  fnUSGS <- c()
-  tnUSGS <- c()
-  fpUSGS <- c()
   tpr <- c()
   fpr <- c()
-  tprUSGS <- c()
-  fprUSGS <- c()
   precision <- c()
   recall <- c()
-  precisionUSGS <- c()
-  recallUSGS <- c()
   thresholds <- c()
   predictions <- data.frame()
   testData$actual_binary <- ifelse(testData$Escherichia.coli >= 235, 1, 0)
   for (threshold in seq(threshBegin, threshEnd, 1)) {
     testData$predictionRF_binary <- ifelse(testData$predictionRF >= threshold, 1, 0)
-    testData$USGS_binary <- ifelse(testData$Predicted.Level >= threshold, 1, 0)
     testData$true_positive <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary  == 1), 1, 0)
     testData$true_negative <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary  == 0), 1, 0)
     testData$false_negative <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary  == 0), 1, 0)
     testData$false_positive <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary  == 1), 1, 0)
-    testData$true_positiveUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary  == 1), 1, 0)
-    testData$true_negativeUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary  == 0), 1, 0)
-    testData$false_negativeUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary  == 0), 1, 0)
-    testData$false_positiveUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary  == 1), 1, 0)
     tp <- c(tp, sum(testData$true_positive))
     fn <- c(fn, sum(testData$false_negative))
     tn <- c(tn, sum(testData$true_negative))
     fp <- c(fp, sum(testData$false_positive))
-    tpUSGS <- c(tpUSGS, sum(testData$true_positiveUSGS))
-    fnUSGS <- c(fnUSGS, sum(testData$false_negativeUSGS))
-    tnUSGS <- c(tnUSGS, sum(testData$true_negativeUSGS))
-    fpUSGS <- c(fpUSGS, sum(testData$false_positiveUSGS))
     tpr = c(tpr, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative)))
     fpr = c(fpr, sum(testData$false_positive) / (sum(testData$false_positive) + sum(testData$true_negative)))
-    tprUSGS <- c(tprUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS)))
-    fprUSGS <- c(fprUSGS, sum(testData$false_positiveUSGS) / (sum(testData$false_positiveUSGS) + sum(testData$true_negativeUSGS)))
     precision = c(precision, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_positive)))
     recall = c(recall, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative)))
-    precisionUSGS <- c(precisionUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_positiveUSGS)))
-    recallUSGS <- c(recallUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS)))
     thresholds <- c(thresholds, threshold)
     if (threshold == thresh) {
       predictions <- rbind(predictions, testData)
@@ -64,20 +47,12 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod
   }
   list("tpr"=tpr,
        "fpr"=fpr,
-       "tprUSGS"=tprUSGS,
-       "fprUSGS"=fprUSGS,
        "precision"=precision,
        "recall"=recall,
-       "precisionUSGS"=precisionUSGS,
-       "recallUSGS"=recallUSGS,
        "tp"=tp,
        "fn"=fn,
        "tn"=tn,
        "fp"=fp,
-       "tpUSGS"=tpUSGS,
-       "fnUSGS"=fnUSGS,
-       "tnUSGS"=tnUSGS,
-       "fpUSGS"=fpUSGS,
        "thresholds"=thresholds,
        "predictions"=predictions)
 }
diff --git a/Master.R b/Master.R
@@ -29,8 +29,8 @@ source("R/00_Startup.R")
 df <- readRDS(paste0(getwd(),"/Data/df.Rds"))
 
 # remove prior modeling variables before starting up a new model
-keep <- list("df", "modelCurves", "modelEcoli")
-rm(list=ls()[!ls() %in% keep])
+# keep <- list("df", "modelCurves", "modelEcoli")
+# rm(list=ls()[!ls() %in% keep])
 
 #-------------------------------------------------------------------------------
 #  CHOOSE PREDICTORS
@@ -42,44 +42,51 @@ df_model <- df[, c("Escherichia.coli", #dependent variable
                    "Client.ID",
                    # "precipProbability",
                    # "Water.Level",
-                   "Rogers_Escherichia.coli",
+                   # "n12th_Escherichia.coli",
+                   "Foster_Escherichia.coli",
+                   # "Ohio_Escherichia.coli",
+                   "North_Avenue_Escherichia.coli",
+                   "n31st_Escherichia.coli",
+                   "Leone_Escherichia.coli",
+                   # "Albion_Escherichia.coli",
+                   # "Rogers_Escherichia.coli",
                    # "Howard_Escherichia.coli",
-                   # "n57th_Escherichia.coli", 
+                   # "n57th_Escherichia.coli",
                    # "n63rd_Escherichia.coli",
-                   # "South_Shore_Escherichia.coli",
+                   "South_Shore_Escherichia.coli",
                    # "Montrose_Escherichia.coli",
                    # "Calumet_Escherichia.coli",
                    # "Rainbow_Escherichia.coli",
                    # "Ohio_DNA.Geo.Mean",
                    # "North_Avenue_DNA.Geo.Mean",
-                   "n63rd_DNA.Geo.Mean",
-                   "South_Shore_DNA.Geo.Mean",
-                   "Montrose_DNA.Geo.Mean",
-                   "Calumet_DNA.Geo.Mean",
-                   "Rainbow_DNA.Geo.Mean",
-                   "Date", #Must use for splitting data, not included in model
-                   "Predicted.Level" #Must use for USGS model comparison, not included in model
-                   )]
-# to run without USGS for comparison, comment out "Predicted.Level" above and uncomment next line
-# df_model$Predicted.Level <- 1 #meaningless value
+                   # "n63rd_DNA.Geo.Mean",
+                   # "South_Shore_DNA.Geo.Mean",
+                   # "Montrose_DNA.Geo.Mean",
+                   # "Calumet_DNA.Geo.Mean",
+                   # "Rainbow_DNA.Geo.Mean",
+                   "Year", #used for splitting data
+                   "Date" #used for splitting data
+)]
+
+finaltest <- df_model[df_model$Year == "2016",]
 
 #-------------------------------------------------------------------------------
 #  CHOOSE TEST/TRAIN SETS
 #  You can decide whether to use kFolds cross validation or define your own sets
 #  If you set kFolds to TRUE, the data will be separated into 10 sets
 #  If you set kFolds to FALSE, the model will use trainStart, trainEnd, etc. (see below)
+#  CANNOT BE USED IF productionMode = TRUE
 #-------------------------------------------------------------------------------
 
-kFolds <- TRUE #If TRUE next 4 lines will not be used but cannot be commented out
-trainStart <- "2006-01-01"
-trainEnd <- "2015-12-31"
-testStart <- "2016-01-01"
-testEnd <- "2016-12-31"
+kFolds <- FALSE #If TRUE next 2 lines will not be used but cannot be commented out
+testYears <- c("2016")
+trainYears <- c("2006", "2007", "2008", "2009","2010", "2011", "2012", "2013", "2014", "2015")
+# trainYears <- trainYears[! trainYears %in% testYears]
 
 # If productionMode is set to TRUE, a file named model.Rds will be generated
 # Its used is explained at https://github.com/Chicago/clear-water-app
-# Set trainStart and trainEnd to what you would like the model to train on
-# testStart and testEnd must still be specified, although not applicable
+# Set trainYears to what you would like the model to train on
+# testYears must still be specified, although not applicable
 # plots will not be accurate
 
 productionMode <- FALSE
@@ -93,9 +100,9 @@ productionMode <- FALSE
 
 # downsample settings
 downsample <- FALSE #If FALSE comment out the next 3 lines
-# highMin <- 200
-# highMax <- 2500
-# lowMax <- 200
+highMin <- 235
+highMax <- 2500
+lowMax <- 235
 
 
 #-------------------------------------------------------------------------------
@@ -108,24 +115,24 @@ downsample <- FALSE #If FALSE comment out the next 3 lines
 
 excludeBeaches <- c(
                     # "12th",
-                    # "31st",
+                    "31st",
                     # "39th",
                     # "57th",
                     "63rd",
                     # "Albion",
                     "Calumet",
-                    # "Foster",
+                    "Foster",
                     # "Howard",
                     # "Jarvis",
                     # "Juneway",
-                    # "Leone",
+                    "Leone",
                     "Montrose",
-                    # "North Avenue",
+                    "North Avenue",
                     # "Oak Street",
-                    # "Ohio",
+                    "Ohio",
                     # "Osterman",
                     "Rainbow",
-                    "Rogers",
+                    # "Rogers",
                     "South Shore"
                     )
 
@@ -137,15 +144,11 @@ excludeBeaches <- c(
 title1 <- paste0("ROC", 
                  if(kFolds == TRUE) " - kFolds",
                  if(kFolds == FALSE) " - validate on ",
-                 if(kFolds == FALSE) testStart,
-                 if(kFolds == FALSE) " to ",
-                 if(kFolds == FALSE) testEnd)
+                 if(kFolds == FALSE) testYears)
 title2 <- paste0("PR Curve", 
                  if(kFolds == TRUE) " - kFolds",
                  if(kFolds == FALSE) " - validate on ",
-                 if(kFolds == FALSE) testStart,
-                 if(kFolds == FALSE) " to ",
-                 if(kFolds == FALSE) testEnd)
+                 if(kFolds == FALSE) testYears)
 
 
 #-------------------------------------------------------------------------------
@@ -154,18 +157,18 @@ title2 <- paste0("PR Curve",
 #-------------------------------------------------------------------------------
 
 threshBegin <- 1
-threshEnd <- 500
+threshEnd <- 1000
 
 
 thresh <- 235
 
 #-------------------------------------------------------------------------------
 #  RUN MODEL
-#  Plots will generate and results will be saved in "model_summary)
+#  Plots will generate and results will be saved in "model_summary"
 #-------------------------------------------------------------------------------
 
 # runs all modeling code
-source("R/30_model.R", print.eval=TRUE)
+source("R/30_Model.R", print.eval=TRUE)
 
 # creates a data frame with all model results
 # this aggregates the folds to generate one single curve
@@ -174,18 +177,31 @@ model_summary <- plot_data %>%
   group_by(thresholds) %>%
   summarize(tpr = mean(tpr),
             fpr = mean(fpr),
-            tprUSGS = mean(tprUSGS),
-            fprUSGS = mean(fprUSGS),
             precision = mean(precision, na.rm = TRUE),
             recall = mean(recall),
-            precisionUSGS = mean(precisionUSGS, na.rm = TRUE),
-            recallUSGS = mean(recallUSGS),
             tp = mean(tp),
             fn = mean(fn),
             tn = mean(tn),
-            fp = mean(fp),
-            tpUSGS = mean(tpUSGS),
-            fnUSGS = mean(fnUSGS),
-            tnUSGS = mean(tnUSGS),
-            fpUSGS = mean(fpUSGS)
-            )
+            fp = mean(fp)
+  )
+
+# saveRDS(model_summary, paste0("model_results/y", testYears, ".Rds"))
+
+## final holdout validation
+
+model <- readRDS("model.Rds")
+finalthresh <- 381
+finaltest <- finaltest[!finaltest$Client.ID %in% excludeBeaches,]
+finaltest <- finaltest[complete.cases(finaltest),]
+finaltest$prediction <- predict(model, finaltest)
+finaltest$actualbin <- ifelse(finaltest$Escherichia.coli >= 235, 1, 0)
+finaltest$predbin <- ifelse(finaltest$prediction >= finalthresh, 1, 0)
+finaltest$tp <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 1, 1, 0)
+finaltest$tn <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 0, 1, 0)
+finaltest$fp <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 1, 1, 0)
+finaltest$fn <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 0, 1, 0)
+sum(finaltest$tp)
+sum(finaltest$fn)
+sum(finaltest$tn)
+sum(finaltest$fp)
+
diff --git a/R/20_Clean.R b/R/20_Clean.R
@@ -83,7 +83,7 @@ for(i in 1:length(beach_dup$Year)){
     if ( is.na(prod(as.numeric(check$Escherichia.coli,na.rm=TRUE)))){
       df[row.names(check)[1],]$Escherichia.coli<-NA
     }
-    else{
+    else if (!is.na(df[row.names(check)[1],]$Escherichia.coli)){
       #Produce a geomean of all the duplicated beaches and assign it to the first
       #beach in a datafr),]
       df[row.names(check)[1],]$Escherichia.coli <-prod(as.numeric(check$Escherichia.coli,na.rm=TRUE))^(1/length(check$Escherichia.coli))
@@ -97,6 +97,9 @@ rm(beach_dup,daily_beach_obs,mult_obs_beach,check,drop_list,i,j)
 #Get rid of the duplicated rows
 df<-df[!duplicated(df[,1:4]),]
 
+#remove artifact NA rows
+df <- df[!is.na(df$DayOfYear),]
+
 #add new columns for predictor beaches.
 #We will use the same-day test results for these beaches to predict other beaches
 for (beach in unique(df$Client.ID)) {
@@ -126,6 +129,8 @@ names(df)[names(df) == "North Avenue_DNA.Geo.Mean"] <- "North_Avenue_DNA.Geo.Mea
 names(df)[names(df) == "Oak Street_DNA.Geo.Mean"] <- "Oak_Street_DNA.Geo.Mean"
 names(df)[names(df) == "South Shore_DNA.Geo.Mean"] <- "South_Shore_DNA.Geo.Mean"
 names(df)[names(df) == "South Shore_Escherichia.coli"] <- "South_Shore_Escherichia.coli"
+names(df)[names(df) == "North Avenue_Escherichia.coli"] <- "North_Avenue_Escherichia.coli"
+
 
 
 #remove times from Date variable