##### time_based_validation.R
##### code copyright July 2015
##### written by Oliver Haimson, with Michael Madaio, Xiang Cheng and Wenwen Zhang
##### on behalf of Data Science for Social Good - Atlanta
##### for Atlanta Fire Rescue Department
##### contact: ohaimson@uci.edu
cat("Running...\n\n")
# You only need to install these packages once, but you need to load them every time you run the code
# Install required packages if they are not already installed
if (!("kernlab" %in% rownames(installed.packages()))) {suppressMessages(install.packages("kernlab"))}
if (!("ROCR" %in% rownames(installed.packages()))) {suppressMessages(install.packages("ROCR"))}
# Load required packages
suppressMessages(library(kernlab))
suppressMessages(library(ROCR))
## Create data frame of independent variables (predictors) for the model ########
## (These variables are assumed to already be loaded in the workspace, e.g. by the data_cleaning script.)
IV = data.frame(NPU, SiteZip, Submarket1, TAX_DISTR, NBHD, ZONING_NUM, building_c, PROP_CLASS, Existing_p, PropertyTy, secondaryT, LUC, Taxes_Per_, Taxes_Tota, TOT_APPR, VAL_ACRES, For_Sale_P, Last_Sale1, yearbuilt, year_reno, Lot_Condition, Structure_Condition, Sidewalks, Multiple_Violations, Vacancy_pc, Total_Avai, Percent_Le, LandArea_a, totalbuild, avg_sf, Floorsize, BldgSF, LotSize, Style, stories, STRUCT_FLR, num_units, LIV_UNITS, UNIT_NUM, construct_, Sprinklers, Star_Ratin, Market_Seg, Bedrooms, Bathrooms, Owner_Name, huden10, empden10, entro10, foursqmi10, ViolentCrime_Den, PropertyCrime_Den, Pct_white, pct_blk, owner_distance, owner_public, Multiple_A, Inspection)
################################################################################
##
## TRAIN A MODEL USING DATA FROM 2011-2014
## TEST IT ON THE LAST YEAR OF DATA, 2014-2015
##
## (We get time info from the pfire and lfire variables, calculated in the
## data_cleaning script from the data file. To change time periods, a new
## data file must be created.)
##
################################################################################
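# (Note: based on how they are used below, pfire, lfire, and gfire are assumed to be 0/1
# indicator vectors created by the data_cleaning script: pfire = fire during the training
# period (2011-2014), lfire = fire during the last year of data (2014-2015), and gfire =
# fire at any point, i.e. roughly:
# gfire <- as.numeric(pfire == 1 | lfire == 1)
# This is only a sketch of the assumed encoding, not code from the data_cleaning script.)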
# Here you can adjust how many samples you want to run. This bootstrapping takes a long time to run, so only use 10 samples if you have some time to spare.
# Plotting of the ROC curves only works if samples is a multiple of 5
samples = 10
accuracyList = c()
truePosList = c()
trueNegList = c()
falsePosList = c()
falseNegList = c()
AUCList = c()
alist = c(); blist = c(); clist = c(); dlist = c()
# Set up the window to plot the ROC curves for each fold
# This only works if samples is a multiple of 5
dev.new(width=11, height=samples/2)
par(mfrow=c(samples/5, 5))
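# If samples is not a multiple of 5, one possible workaround (not part of the original
# script) is to round the number of plot rows up instead, e.g.:
# par(mfrow=c(ceiling(samples/5), 5))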
# Find the proportion of fire properties whose fires occurred prior to the last year of data:
# prior fires / (prior fires + last-year fires)
num_pfire = length(pfire[pfire==1])
num_lfire = length(lfire[lfire==1])
perc_pfire = num_pfire/(num_pfire+num_lfire)
perc_lfire = num_lfire/(num_pfire+num_lfire)
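# Worked example with hypothetical counts: if 2,000 properties had a prior fire and 500 had
# a fire in the last year, then perc_pfire = 2000/2500 = 0.8 and perc_lfire = 0.2, so
# roughly 80% of the non-fire properties are sampled into the training set below and the
# remaining 20% are left for the test set.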
for (i in 1:samples) {
# TRAIN data: properties with fires prior to the last year of data, plus a random sample (perc_pfire) of the non-fire data
# indices of properties without fires
noFire_ind = which(gfire!=1)
sampleSize = floor(perc_pfire*length(noFire_ind))
train_ind_ind = sample(seq_len(length(noFire_ind)), size=sampleSize)
train_ind = noFire_ind[train_ind_ind]
train_ind2 = append(train_ind, which(pfire==1))
train = IV[train_ind2,]
# TEST data: properties with fires in the last year of data, plus the remaining (roughly perc_lfire of the) non-fire data
test_ind = noFire_ind[-train_ind_ind]
test_ind2 = append(test_ind, which(lfire==1))
test = IV[test_ind2,]
trainFire = gfire[train_ind2]
testFire = gfire[test_ind2]
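# With the assumed encoding of gfire described above, trainFire is 1 for the prior-fire
# properties and 0 for the sampled non-fire properties, and testFire is 1 for the
# last-year-fire properties and 0 for the held-out non-fire properties.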
# Train an SVM on the training data (trainFire is numeric, so ksvm fits a regression SVM
# by default) and predict continuous scores for the test data
trFit <- ksvm(trainFire~., data=train)
pred <- predict(trFit, test, type="response")
# Convert the continuous scores to 0/1 predictions using a threshold of 0.025
predictions = pred
predictions[pred >= .025] <- 1
predictions[pred < .025] <- 0
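# An alternative sketch (not the approach used in this script) would be to treat the
# response as a factor so that ksvm() fits a classifier and can return class
# probabilities; the names below are illustrative only:
# trFitClass <- ksvm(as.factor(trainFire)~., data=train, prob.model=TRUE)
# probs <- predict(trFitClass, test, type="probabilities")[, "1"]
# predictions <- as.numeric(probs >= 0.5)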
# Calculate the AUC and plot the ROC curve (these are some metrics for seeing how well the model did).
# AUC
predScore <- prediction(predictions, testFire)
auc.tmp <- performance(predScore, "auc")
auc <- as.numeric(auc.tmp@y.values)
AUCList = append(AUCList, auc)
#ROC curve
perf <- performance(predScore, measure = "tpr", x.measure = "fpr")
plot(perf, lwd=2, main=paste("Sample ", i, ", AUC = ", round(auc,digits=4), sep=""))
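# Note that predScore above is built from the thresholded 0/1 predictions, so the ROC
# curve has a single operating point. If a full curve over all thresholds were wanted, one
# option (a variation, not what this script does) would be to score with the raw SVM
# output instead, e.g.:
# predScoreRaw <- prediction(as.numeric(pred), testFire)
# perfRaw <- performance(predScoreRaw, measure="tpr", x.measure="fpr")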
# test set metrics
# table(testFire, predictions) has the actual values as rows and the predicted values as
# columns, so (reading column-wise): t[1] = true negatives, t[2] = false negatives,
# t[3] = false positives, t[4] = true positives
t = table(testFire, predictions)
a = t[1]; b = t[3]; c = t[2]; d = t[4]  # a = TN, b = FP, c = FN, d = TP
truePos = d/(c+d)   # true positive rate (sensitivity): TP / (TP + FN)
trueNeg = a/(a+b)   # true negative rate (specificity): TN / (TN + FP)
falsePos = b/(a+b)  # false positive rate: FP / (TN + FP)
falseNeg = c/(c+d)  # false negative rate: FN / (TP + FN)
accur = (a+d)/(a+b+c+d)  # overall accuracy
cat(paste("\nSAMPLE ", i, ":\n\n", sep="")); print(t); cat("\n"); cat(paste("accuracy: ", round(accur,digits=2))); cat("\n"); cat(paste("true positive rate: ", round(truePos,digits=4))); cat("\n"); cat(paste("true negative rate: ", round(trueNeg,digits=4))); cat("\n"); cat(paste("AUC: ", round(auc,digits=4))); cat("\n\n");
accuracyList = append(accuracyList, accur)
truePosList = append(truePosList, truePos)
trueNegList = append(trueNegList, trueNeg)
falsePosList = append(falsePosList, falsePos)
falseNegList = append(falseNegList, falseNeg)
alist = append(alist, a); blist = append(blist, b); clist = append(clist, c); dlist = append(dlist, d)
}
testAvgAccuracy = mean(accuracyList)
testAvgtruePos = mean(truePosList)
testAvgtrueNeg = mean(trueNegList)
testAvgfalsePos = mean(falsePosList)
testAvgfalseNeg = mean(falseNegList)
testAvgAUC = mean(AUCList)
aAvg = mean(alist); bAvg = mean(blist); cAvg = mean(clist); dAvg = mean(dlist)
## Plot the confusion matrix of the average results as a heat map ########
dev.new(width=6, height=6)
par(mar=c(5,5,2,2))
# image() plots mat[i,j] at (x[i], y[j]), so lay the rates out to match the quadrant
# labels below: (0,0) = true negatives, (1,0) = false positives, (0,1) = false negatives,
# (1,1) = true positives
mat = matrix(c(testAvgtrueNeg, testAvgfalsePos, testAvgfalseNeg, testAvgtruePos), ncol=2)
image(mat, axes=F, cex.lab=1.8, ylab="Actual", xlab="Predicted")
mtext(c("0","1"), las=1, side=2, adj=2, outer=F, at=c(0,1), cex=1.8)
mtext(c("0","1"), las=1, side=1, padj=1, outer=F, at=c(0,1), cex=1.8)
text(1,1,paste("true positives\n", "(had fire;\n predicted fire)\n","\n", "n =", round(dAvg,digits=0), "\n", round(testAvgtruePos,digits=4)), cex=1.6, col="white")
text(0,0,paste("true negatives\n", "(no fire;\n predicted no fire)\n","\n", "n =", round(aAvg,digits=0), "\n", round(testAvgtrueNeg,digits=4)), cex=1.6, col="white")
text(0,1,paste("false negatives\n", "(had fire;\n predicted no fire)\n","\n","n =", round(cAvg,digits=0), "\n", round(testAvgfalseNeg,digits=4)), cex=1.6)
text(1,0,paste("false positives\n", "(no fire;\n predicted fire)\n","\n","n =", round(bAvg,digits=0), "\n", round(testAvgfalsePos,digits=4)), cex=1.6)
## See how well the model did on average, and print out a table of the results ########
accuracyList = append(accuracyList, testAvgAccuracy)
truePosList = append(truePosList, testAvgtruePos)
trueNegList = append(trueNegList, testAvgtrueNeg)
falsePosList = append(falsePosList, testAvgfalsePos)
falseNegList = append(falseNegList, testAvgfalseNeg)
AUCList = append(AUCList, testAvgAUC)
results = data.frame(c(1:samples, "TEST AVERAGE:"), accuracyList, AUCList, truePosList, trueNegList, falsePosList, falseNegList)
colnames(results) <- c("sample", "accuracy", "AUC", "true positive rate", "true negative rate", "false positive rate", "false negative rate")
print(results, row.names = FALSE)
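# If a copy of this results table is wanted on disk (not part of the original script; the
# filename is just an illustration), it could be written out with:
# write.csv(results, "time_based_validation_results.csv", row.names=FALSE)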
cat("\n\nDone")