Skip to content

Commit

Permalink
Merge pull request #156 from Chicago/dev
Browse files Browse the repository at this point in the history
Modify to accommodate changes needed for technical paper peer review
  • Loading branch information
nicklucius committed Aug 30, 2018
2 parents 1005bd5 + 86f8e33 commit dd07219
Show file tree
Hide file tree
Showing 21 changed files with 41,910 additions and 37,631 deletions.
8,008 changes: 4,081 additions & 3,927 deletions CSVs/Beach_Water_Levels.csv

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions CSVs/cleanbeachnames.csv
Expand Up @@ -35,6 +35,7 @@ Foster,Foster,NA,Foster,7,41.9785,-87.6515,1,1,3
Foster-Beach,Foster,NA,Foster,7,41.9785,-87.6515,1,1,3
Front 1 & 2*,NA,1,NA,NA,NA,NA,NA,NA,NA
Hartigan,Hartigan,NA,Albion,5,42.0027,-87.6564,2,1,4
Hartigan (Albion),Hartigan,NA,Albion,5,42.0027,-87.6564,2,1,4
Hartigan-Beach,Hartigan,NA,Albion,5,42.0027,-87.6564,2,1,4
Hollywood,Osterman,NA,Osterman,6,41.9877,-87.6545,1,1,8
Hollywood/Osterman,Osterman,NA,Osterman,6,41.9877,-87.6545,1,1,8
Expand Down Expand Up @@ -69,8 +70,11 @@ Loyola1,NA,1,NA,NA,NA,NA,NA,NA,NA
Loyola2,NA,1,NA,NA,NA,NA,NA,NA,NA
Loyola-Beach,Loyola,NA,Leone,4,42.0131,-87.6635,1,1,4
Margaret T Burroughs,Margaret T Burroughs,NA,31st,13,41.8393,-87.6072,4,0,1
Margaret T Burroughs (31st),Margaret T Burroughs,NA,31st,13,41.8393,-87.6072,4,0,1
Marion Mahoney Griffin,Marion Mahoney Griffin,NA,Jarvis,3,42.0161,-87.6647,2,1,4
Marion-Mahoney-Griffin-Beach,Marion Mahoney Griffin,NA,Jarvis,3,42.0161,-87.6647,2,1,4
Marion Mahony Griffin (Jarvis),Marion Mahoney Griffin,NA,Jarvis,3,42.0161,-87.6647,2,1,4
Marion Mahony Griffin,Marion Mahoney Griffin,NA,Jarvis,3,42.0161,-87.6647,2,1,4
Montrose,Montrose,NA,Montrose,8,41.9655,-87.6385,1,1,5
Montrose Dog,NA,366,NA,NA,NA,NA,NA,NA,NA
Montrose-Beach,Montrose,NA,Montrose,8,41.9655,-87.6385,1,1,5
Expand Down
2 changes: 2 additions & 0 deletions CSVs/lock_openings.csv
@@ -1,4 +1,6 @@
begin_date,end_date,days_lock_open,O'Brien,CRCW,Wilmette,total_volume
10/14/2017,10/15/2017,2,0,2456.40,289.80,2746.20
7/25/2016,7/24/2016,2,0,0,34.01,34.01
6/15/2015,6/16/2015,2,0,997.5,167.2,1164.7
6/30/2014,7/1/2014,2,0,632,163,795
4/18/2013,4/19/2013,2,3185.6,6104.7,1429.2,10719.5
Expand Down
70,380 changes: 36,720 additions & 33,660 deletions CSVs/weather_data.csv

Large diffs are not rendered by default.

Binary file added Data/df-3-day.Rds
Binary file not shown.
Binary file modified Data/df.Rds
Binary file not shown.
3 changes: 2 additions & 1 deletion Functions/modelEcoli.R
Expand Up @@ -54,5 +54,6 @@ modelEcoli <- function(trainData, testData, threshBegin, threshEnd, thresh, prod
"tn"=tn,
"fp"=fp,
"thresholds"=thresholds,
"predictions"=predictions)
"predictions"=predictions,
"model"=model)
}
41 changes: 41 additions & 0 deletions Functions/shift_previous_data.R
@@ -0,0 +1,41 @@
#' Add lagged ("n observations prior") copies of columns, per year and beach
#'
#' Rows are grouped by `Year` and then by `Client.ID`, each group is sorted by
#' `Date`, and for every requested column a new column named
#' `<column>.<number_of_observations>.daysPrior` is appended. It holds the
#' value found `number_of_observations` rows earlier in the same group, or
#' `NA` where the group has no such earlier row.
#'
#' This function can be called with or without the vector of column names,
#' `names_of_columns_to_shift`. Without it, it defaults to all numeric columns
#' in `original_data_frame`.
#'
#' @param number_of_observations Single non-negative number: how many rows
#'   back (within a year/beach group) to look.
#' @param original_data_frame Data frame with at least the columns
#'   `Client.ID`, `Year` and `Date`.
#' @param names_of_columns_to_shift Character vector of column names to lag.
#'   `FALSE` (the default) or `NULL` selects every numeric column.
#' @return A data frame with the rows of `original_data_frame` that have a
#'   non-missing `Client.ID` and `Year`, ordered by year, beach and date, plus
#'   the lagged columns. An empty `data.frame()` when no rows remain.
shift_previous_data <- function(number_of_observations, original_data_frame, names_of_columns_to_shift=FALSE) {
  stopifnot(
    is.numeric(number_of_observations),
    length(number_of_observations) == 1L,
    !is.na(number_of_observations),
    number_of_observations >= 0
  )

  # remove rows where Client.ID == NA
  clean_data_frame <- original_data_frame[
    !is.na(original_data_frame$Client.ID), , drop = FALSE
  ]
  if (nrow(clean_data_frame) == 0L) {
    return(data.frame())
  }

  # if no columns provided, use default (all numeric columns); resolved once,
  # before any lag column is added, so the lag columns are never re-shifted
  first_name <- names_of_columns_to_shift[1]
  use_default_columns <- is.null(names_of_columns_to_shift) ||
    (is.logical(first_name) && !is.na(first_name) && !first_name)
  if (use_default_columns) {
    is_numeric_column <- vapply(clean_data_frame, is.numeric, logical(1))
    names_of_columns_to_shift <- colnames(clean_data_frame)[is_numeric_column]
  }
  unknown_columns <- setdiff(names_of_columns_to_shift, colnames(clean_data_frame))
  if (length(unknown_columns) > 0L) {
    stop(
      "Columns not found in original_data_frame: ",
      paste(unknown_columns, collapse = ", "),
      call. = FALSE
    )
  }

  # collect each processed group in a list and bind once at the end instead of
  # growing a data frame with rbind() on every iteration
  shifted_groups <- list()

  # subset by year; a missing Year cannot be matched with `==`, so skip it
  years <- unique(clean_data_frame$Year)
  years <- years[!is.na(years)]
  for (year in years) {
    readings_by_year <- clean_data_frame[
      which(clean_data_frame$Year == year), , drop = FALSE
    ]
    # subset by beach within year
    for (beach in unique(readings_by_year$Client.ID)) {
      readings_by_beach <- readings_by_year[
        readings_by_year$Client.ID == beach, , drop = FALSE
      ]
      readings_by_beach <- readings_by_beach[
        order(readings_by_beach$Date), , drop = FALSE
      ]

      # row i takes its value from row i - number_of_observations; rows with
      # no such predecessor index with NA, which yields NA of the column's own
      # type and keeps the result exactly nrow() long even for short groups
      source_rows <- seq_len(nrow(readings_by_beach)) - number_of_observations
      source_rows[source_rows < 1] <- NA

      for (column in names_of_columns_to_shift) {
        new_column_name <- paste(column, number_of_observations, "daysPrior", sep = ".")
        readings_by_beach[[new_column_name]] <- readings_by_beach[[column]][source_rows]
      }

      shifted_groups[[length(shifted_groups) + 1L]] <- readings_by_beach
    }
  }

  if (length(shifted_groups) == 0L) {
    return(data.frame())
  }
  # rebuild the data frame from the per-year/per-beach pieces
  do.call(rbind, shifted_groups)
}
41 changes: 3 additions & 38 deletions Master.R
Expand Up @@ -40,30 +40,13 @@ df <- readRDS(paste0(getwd(),"/Data/df.Rds"))
# set predictors
df_model <- df[, c("Escherichia.coli", #dependent variable
"Client.ID",
# "precipProbability",
# "Water.Level",
# "n12th_Escherichia.coli",

"Foster_Escherichia.coli",
# "Ohio_Escherichia.coli",
"North_Avenue_Escherichia.coli",
"n31st_Escherichia.coli",
"Leone_Escherichia.coli",
# "Albion_Escherichia.coli",
# "Rogers_Escherichia.coli",
# "Howard_Escherichia.coli",
# "n57th_Escherichia.coli",
# "n63rd_Escherichia.coli",
"South_Shore_Escherichia.coli",
# "Montrose_Escherichia.coli",
# "Calumet_Escherichia.coli",
# "Rainbow_Escherichia.coli",
# "Ohio_DNA.Geo.Mean",
# "North_Avenue_DNA.Geo.Mean",
# "n63rd_DNA.Geo.Mean",
# "South_Shore_DNA.Geo.Mean",
# "Montrose_DNA.Geo.Mean",
# "Calumet_DNA.Geo.Mean",
# "Rainbow_DNA.Geo.Mean",

"Year", #used for splitting data
"Date" #used for splitting data
)]
Expand Down Expand Up @@ -185,23 +168,5 @@ model_summary <- plot_data %>%
fp = mean(fp)
)

# saveRDS(model_summary, paste0("model_results/y", testYears, ".Rds"))

## final holdout validation

model <- readRDS("model.Rds")
finalthresh <- 381
finaltest <- finaltest[!finaltest$Client.ID %in% excludeBeaches,]
finaltest <- finaltest[complete.cases(finaltest),]
finaltest$prediction <- predict(model, finaltest)
finaltest$actualbin <- ifelse(finaltest$Escherichia.coli >= 235, 1, 0)
finaltest$predbin <- ifelse(finaltest$prediction >= finalthresh, 1, 0)
finaltest$tp <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 1, 1, 0)
finaltest$tn <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 0, 1, 0)
finaltest$fp <- ifelse(finaltest$actualbin == 0 & finaltest$predbin == 1, 1, 0)
finaltest$fn <- ifelse(finaltest$actualbin == 1 & finaltest$predbin == 0, 1, 0)
sum(finaltest$tp)
sum(finaltest$fn)
sum(finaltest$tn)
sum(finaltest$fp)
saveRDS(model, paste0("models/", "model-hybrid-only-2016-holdout", ".Rds"))

0 comments on commit dd07219

Please sign in to comment.