Skip to content

Commit

Permalink
Merge pull request #154 from Chicago/dev
Browse files Browse the repository at this point in the history
Promoting final model to coincide with technical paper
  • Loading branch information
tomschenkjr committed Jan 10, 2018
2 parents 455e85c + 19a18d5 commit 1005bd5
Show file tree
Hide file tree
Showing 9 changed files with 192 additions and 315 deletions.
Binary file modified Data/df.Rds
Binary file not shown.
35 changes: 5 additions & 30 deletions Functions/modelEcoli.R
Expand Up @@ -5,54 +5,37 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod
test_vars <- ncol(testData)
model <- randomForest(Escherichia.coli ~ .,
data = trainData[,
c(1:(train_vars - 1))])
c(1:(train_vars - 2))],
ntree = 1000,
importance = TRUE,
proxmity = TRUE)
# maxnodes = 20)
testData$predictionRF <- predict(model, testData[,c(1:(test_vars-2))])
tp <- c()
fn <- c()
tn <- c()
fp <- c()
tpUSGS <- c()
fnUSGS <- c()
tnUSGS <- c()
fpUSGS <- c()
tpr <- c()
fpr <- c()
tprUSGS <- c()
fprUSGS <- c()
precision <- c()
recall <- c()
precisionUSGS <- c()
recallUSGS <- c()
thresholds <- c()
predictions <- data.frame()
testData$actual_binary <- ifelse(testData$Escherichia.coli >= 235, 1, 0)
for (threshold in seq(threshBegin, threshEnd, 1)) {
testData$predictionRF_binary <- ifelse(testData$predictionRF >= threshold, 1, 0)
testData$USGS_binary <- ifelse(testData$Predicted.Level >= threshold, 1, 0)
testData$true_positive <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary == 1), 1, 0)
testData$true_negative <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary == 0), 1, 0)
testData$false_negative <- ifelse((testData$actual_binary == 1 & testData$predictionRF_binary == 0), 1, 0)
testData$false_positive <- ifelse((testData$actual_binary == 0 & testData$predictionRF_binary == 1), 1, 0)
testData$true_positiveUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary == 1), 1, 0)
testData$true_negativeUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary == 0), 1, 0)
testData$false_negativeUSGS <- ifelse((testData$actual_binary == 1 & testData$USGS_binary == 0), 1, 0)
testData$false_positiveUSGS <- ifelse((testData$actual_binary == 0 & testData$USGS_binary == 1), 1, 0)
tp <- c(tp, sum(testData$true_positive))
fn <- c(fn, sum(testData$false_negative))
tn <- c(tn, sum(testData$true_negative))
fp <- c(fp, sum(testData$false_positive))
tpUSGS <- c(tpUSGS, sum(testData$true_positiveUSGS))
fnUSGS <- c(fnUSGS, sum(testData$false_negativeUSGS))
tnUSGS <- c(tnUSGS, sum(testData$true_negativeUSGS))
fpUSGS <- c(fpUSGS, sum(testData$false_positiveUSGS))
tpr = c(tpr, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative)))
fpr = c(fpr, sum(testData$false_positive) / (sum(testData$false_positive) + sum(testData$true_negative)))
tprUSGS <- c(tprUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS)))
fprUSGS <- c(fprUSGS, sum(testData$false_positiveUSGS) / (sum(testData$false_positiveUSGS) + sum(testData$true_negativeUSGS)))
precision = c(precision, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_positive)))
recall = c(recall, sum(testData$true_positive) / (sum(testData$true_positive) + sum(testData$false_negative)))
precisionUSGS <- c(precisionUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_positiveUSGS)))
recallUSGS <- c(recallUSGS, sum(testData$true_positiveUSGS) / (sum(testData$true_positiveUSGS) + sum(testData$false_negativeUSGS)))
thresholds <- c(thresholds, threshold)
if (threshold == thresh) {
predictions <- rbind(predictions, testData)
Expand All @@ -64,20 +47,12 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod
}
list("tpr"=tpr,
"fpr"=fpr,
"tprUSGS"=tprUSGS,
"fprUSGS"=fprUSGS,
"precision"=precision,
"recall"=recall,
"precisionUSGS"=precisionUSGS,
"recallUSGS"=recallUSGS,
"tp"=tp,
"fn"=fn,
"tn"=tn,
"fp"=fp,
"tpUSGS"=tpUSGS,
"fnUSGS"=fnUSGS,
"tnUSGS"=tnUSGS,
"fpUSGS"=fpUSGS,
"thresholds"=thresholds,
"predictions"=predictions)
}
116 changes: 66 additions & 50 deletions Master.R
Expand Up @@ -29,8 +29,8 @@ source("R/00_Startup.R")
df <- readRDS(paste0(getwd(),"/Data/df.Rds"))

# remove prior modeling variables before starting up a new model
keep <- list("df", "modelCurves", "modelEcoli")
rm(list=ls()[!ls() %in% keep])
# keep <- list("df", "modelCurves", "modelEcoli")
# rm(list=ls()[!ls() %in% keep])

#-------------------------------------------------------------------------------
# CHOOSE PREDICTORS
Expand All @@ -42,44 +42,51 @@ df_model <- df[, c("Escherichia.coli", #dependent variable
"Client.ID",
# "precipProbability",
# "Water.Level",
"Rogers_Escherichia.coli",
# "n12th_Escherichia.coli",
"Foster_Escherichia.coli",
# "Ohio_Escherichia.coli",
"North_Avenue_Escherichia.coli",
"n31st_Escherichia.coli",
"Leone_Escherichia.coli",
# "Albion_Escherichia.coli",
# "Rogers_Escherichia.coli",
# "Howard_Escherichia.coli",
# "n57th_Escherichia.coli",
# "n57th_Escherichia.coli",
# "n63rd_Escherichia.coli",
# "South_Shore_Escherichia.coli",
"South_Shore_Escherichia.coli",
# "Montrose_Escherichia.coli",
# "Calumet_Escherichia.coli",
# "Rainbow_Escherichia.coli",
# "Ohio_DNA.Geo.Mean",
# "North_Avenue_DNA.Geo.Mean",
"n63rd_DNA.Geo.Mean",
"South_Shore_DNA.Geo.Mean",
"Montrose_DNA.Geo.Mean",
"Calumet_DNA.Geo.Mean",
"Rainbow_DNA.Geo.Mean",
"Date", #Must use for splitting data, not included in model
"Predicted.Level" #Must use for USGS model comparison, not included in model
)]
# to run without USGS for comparison, comment out "Predicted.Level" above and uncomment next line
# df_model$Predicted.Level <- 1 #meaningless value
# "n63rd_DNA.Geo.Mean",
# "South_Shore_DNA.Geo.Mean",
# "Montrose_DNA.Geo.Mean",
# "Calumet_DNA.Geo.Mean",
# "Rainbow_DNA.Geo.Mean",
"Year", #used for splitting data
"Date" #used for splitting data
)]

finaltest <- df_model[df_model$Year == "2016",]

#-------------------------------------------------------------------------------
# CHOOSE TEST/TRAIN SETS
# You can decide whether to use kFolds cross validation or define your own sets
# If you set kFolds to TRUE, the data will be separated into 10 sets
# If you set kFolds to FALSE, the model will use trainStart, trainEnd, etc. (see below)
# CANNOT BE USED IF productionMode = TRUE
#-------------------------------------------------------------------------------

kFolds <- TRUE #If TRUE next 4 lines will not be used but cannot be commented out
trainStart <- "2006-01-01"
trainEnd <- "2015-12-31"
testStart <- "2016-01-01"
testEnd <- "2016-12-31"
kFolds <- FALSE #If TRUE next 2 lines will not be used but cannot be commented out
testYears <- c("2016")
trainYears <- c("2006", "2007", "2008", "2009","2010", "2011", "2012", "2013", "2014", "2015")
# trainYears <- trainYears[! trainYears %in% testYears]

# If productionMode is set to TRUE, a file named model.Rds will be generated
# Its used is explained at https://github.com/Chicago/clear-water-app
# Set trainStart and trainEnd to what you would like the model to train on
# testStart and testEnd must still be specified, although not applicable
# Set trainYears to what you would like the model to train on
# testYears must still be specified, although not applicable
# plots will not be accurate

productionMode <- FALSE
Expand All @@ -93,9 +100,9 @@ productionMode <- FALSE

# downsample settings
downsample <- FALSE #If FALSE comment out the next 3 lines
# highMin <- 200
# highMax <- 2500
# lowMax <- 200
highMin <- 235
highMax <- 2500
lowMax <- 235


#-------------------------------------------------------------------------------
Expand All @@ -108,24 +115,24 @@ downsample <- FALSE #If FALSE comment out the next 3 lines

excludeBeaches <- c(
# "12th",
# "31st",
"31st",
# "39th",
# "57th",
"63rd",
# "Albion",
"Calumet",
# "Foster",
"Foster",
# "Howard",
# "Jarvis",
# "Juneway",
# "Leone",
"Leone",
"Montrose",
# "North Avenue",
"North Avenue",
# "Oak Street",
# "Ohio",
"Ohio",
# "Osterman",
"Rainbow",
"Rogers",
# "Rogers",
"South Shore"
)

Expand All @@ -137,15 +144,11 @@ excludeBeaches <- c(
title1 <- paste0("ROC",
if(kFolds == TRUE) " - kFolds",
if(kFolds == FALSE) " - validate on ",
if(kFolds == FALSE) testStart,
if(kFolds == FALSE) " to ",
if(kFolds == FALSE) testEnd)
if(kFolds == FALSE) testYears)
title2 <- paste0("PR Curve",
if(kFolds == TRUE) " - kFolds",
if(kFolds == FALSE) " - validate on ",
if(kFolds == FALSE) testStart,
if(kFolds == FALSE) " to ",
if(kFolds == FALSE) testEnd)
if(kFolds == FALSE) testYears)


#-------------------------------------------------------------------------------
Expand All @@ -154,18 +157,18 @@ title2 <- paste0("PR Curve",
#-------------------------------------------------------------------------------

threshBegin <- 1
threshEnd <- 500
threshEnd <- 1000


thresh <- 235

#-------------------------------------------------------------------------------
# RUN MODEL
# Plots will generate and results will be saved in "model_summary)
# Plots will generate and results will be saved in "model_summary"
#-------------------------------------------------------------------------------

# runs all modeling code
source("R/30_model.R", print.eval=TRUE)
source("R/30_Model.R", print.eval=TRUE)

# creates a data frame with all model results
# this aggregates the folds to generate one single curve
Expand All @@ -174,18 +177,31 @@ model_summary <- plot_data %>%
group_by(thresholds) %>%
summarize(tpr = mean(tpr),
fpr = mean(fpr),
tprUSGS = mean(tprUSGS),
fprUSGS = mean(fprUSGS),
precision = mean(precision, na.rm = TRUE),
recall = mean(recall),
precisionUSGS = mean(precisionUSGS, na.rm = TRUE),
recallUSGS = mean(recallUSGS),
tp = mean(tp),
fn = mean(fn),
tn = mean(tn),
fp = mean(fp),
tpUSGS = mean(tpUSGS),
fnUSGS = mean(fnUSGS),
tnUSGS = mean(tnUSGS),
fpUSGS = mean(fpUSGS)
)
fp = mean(fp)
)

# saveRDS(model_summary, paste0("model_results/y", testYears, ".Rds"))

## final holdout validation

model <- readRDS("model.Rds")
finalthresh <- 381
finaltest <- finaltest[!finaltest$Client.ID %in% excludeBeaches,]
finaltest <- finaltest[complete.cases(finaltest),]
finaltest$prediction <- predict(model, finaltest)
finaltest$actualbin <- ifelse(finaltest$Escherichia.coli >= 235, 1, 0)
finaltest$predbin <- ifelse(finaltest$prediction >= finalthresh, 1, 0)
finaltest$tp <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 1, 1, 0)
finaltest$tn <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 0, 1, 0)
finaltest$fp <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 1, 1, 0)
finaltest$fn <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 0, 1, 0)
sum(finaltest$tp)
sum(finaltest$fn)
sum(finaltest$tn)
sum(finaltest$fp)

7 changes: 6 additions & 1 deletion R/20_Clean.R
Expand Up @@ -83,7 +83,7 @@ for(i in 1:length(beach_dup$Year)){
if ( is.na(prod(as.numeric(check$Escherichia.coli,na.rm=TRUE)))){
df[row.names(check)[1],]$Escherichia.coli<-NA
}
else{
else if (!is.na(df[row.names(check)[1],]$Escherichia.coli)){
#Produce a geomean of all the duplicated beaches and assign it to the first
#beach in a datafr),]
df[row.names(check)[1],]$Escherichia.coli <-prod(as.numeric(check$Escherichia.coli,na.rm=TRUE))^(1/length(check$Escherichia.coli))
Expand All @@ -97,6 +97,9 @@ rm(beach_dup,daily_beach_obs,mult_obs_beach,check,drop_list,i,j)
#Get rid of the duplicated rows
df<-df[!duplicated(df[,1:4]),]

#remove artifact NA rows
df <- df[!is.na(df$DayOfYear),]

#add new columns for predictor beaches.
#We will use the same-day test results for these beaches to predict other beaches
for (beach in unique(df$Client.ID)) {
Expand Down Expand Up @@ -126,6 +129,8 @@ names(df)[names(df) == "North Avenue_DNA.Geo.Mean"] <- "North_Avenue_DNA.Geo.Mea
names(df)[names(df) == "Oak Street_DNA.Geo.Mean"] <- "Oak_Street_DNA.Geo.Mean"
names(df)[names(df) == "South Shore_DNA.Geo.Mean"] <- "South_Shore_DNA.Geo.Mean"
names(df)[names(df) == "South Shore_Escherichia.coli"] <- "South_Shore_Escherichia.coli"
names(df)[names(df) == "North Avenue_Escherichia.coli"] <- "North_Avenue_Escherichia.coli"



#remove times from Date variable
Expand Down

0 comments on commit 1005bd5

Please sign in to comment.