/
cross_validation.R
executable file
·149 lines (112 loc) · 6.27 KB
/
cross_validation.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
##### code copyright July 2015
##### written by Oliver Haimson, with Michael Madaio, Xiang Cheng and Wenwen Zhang
##### on behalf of Data Science for Social Good - Atlanta
##### for Atlanta Fire Rescue Department
##### contact: ohaimson@uci.edu
cat("Running...\n\n")
# you only need to install these packages once, but you need to load them every time you run the code
#install required packages
if ("kernlab" %in% rownames(installed.packages()) == FALSE) {suppressMessages(install.packages("kernlab"))}
if ("ROCR" %in% rownames(installed.packages()) == FALSE) {suppressMessages(install.packages("ROCR"))}
#load required packages
suppressWarnings(suppressMessages(library(kernlab)))
suppressWarnings(suppressMessages(library(ROCR)))
## Create data frame of independent variables for model ########
IV = data.frame(NPU, SiteZip, Submarket1, TAX_DISTR, NBHD, ZONING_NUM, building_c, PROP_CLASS, Existing_p, PropertyTy, secondaryT, LUC, Taxes_Per_, Taxes_Tota, TOT_APPR, VAL_ACRES, For_Sale_P, Last_Sale1, yearbuilt, year_reno, Lot_Condition, Structure_Condition, Sidewalks, Multiple_Violations, Vacancy_pc, Total_Avai, Percent_Le, LandArea_a, totalbuild, avg_sf, Floorsize, BldgSF, LotSize, Style, stories, STRUCT_FLR, num_units, LIV_UNITS, UNIT_NUM, construct_, Sprinklers, Star_Ratin, Market_Seg, Bedrooms, Bathrooms, Owner_Name, huden10, empden10, entro10, foursqmi10, ViolentCrime_Den, PropertyCrime_Den, Pct_white, pct_blk, owner_distance, owner_public, Multiple_A, Inspection)
################################################################################
##
## PERFORM 10-FOLD CROSS VALIDATION ON THE MODEL, TO SEE HOW WELL IT DOES AND
## HOW MUCH OVERFITTING IS OCCURRING
##
################################################################################
# Here you can adjust how many folds you want to run. Cross-validation takes a long time to run, so only run 10 folds if you have some time to spare.
# Plotting for ROC curve only works if folds is a multiple of 5
folds = 10
accuracyList = c()
truePosList = c()
trueNegList = c()
falsePosList = c()
falseNegList = c()
AUCList = c()
alist = c(); blist = c(); clist = c(); dlist = c()
# Split the data into k folds
testSize = floor((1/folds)*dim(IV)[1])
possibleIndices = c(1:dim(IV)[1])
testIndices = data.frame(matrix(ncol=0, nrow=testSize))
for (i in 1:folds) {
test_ind = sample(possibleIndices, size=testSize)
testIndices = data.frame(testIndices, test_ind)
possibleIndices = possibleIndices[! possibleIndices %in% testIndices[,i]]
}
# Set up the window to plot the ROC curves for each fold
# This only works if folds is a multiple of 5
dev.new(width=11, height=folds/2)
par(mfrow=c(folds/5, 5))
# Run the cross-validation
for (i in 1:folds) {
# training
test_ind = testIndices[,i]
test = IV[test_ind,]
train = IV[-test_ind,]
testFire = gfire[test_ind]
trainFire = gfire[-test_ind]
trFit <- ksvm(trainFire~., data=train)
# testing
pred <- predict(trFit, test, type="response")
predictions = pred
predictions[pred >= .025] <- 1
predictions[pred < .025] <- 0
# Calculate the AUC and plot the ROC curve (these are some metrics for seeing how well the model did).
# AUC
predScore <- prediction(predictions, testFire)
auc.tmp <- performance(predScore, "auc")
auc <- as.numeric(auc.tmp@y.values)
AUCList = append(AUCList, auc)
#ROC curve
perf <- performance(predScore, measure = "tpr", x.measure = "fpr")
plot(perf, lwd=2, main=paste("Fold ", i, ", AUC = ", round(auc,digits=4), sep=""))
# test set metrics
t = table(testFire, predictions)
a = t[1]; b = t[3]; c = t[2]; d = t[4]
truePos = d/(c+d)
trueNeg = a/(a+b)
falsePos = b/(a+b)
falseNeg = c/(c+d)
accur = (a+d)/(a+b+c+d)
cat(paste("\nFOLD ", i, ":\n\n", sep="")); print(t); cat("\n"); cat(paste("accuracy: ", round(accur,digits=2))); cat("\n"); cat(paste("true positive rate: ", round(truePos,digits=4))); cat("\n"); cat(paste("true negative rate: ", round(trueNeg,digits=4))); cat("\n"); cat(paste("AUC: ", round(auc,digits=4))); cat("\n\n");
accuracyList = append(accuracyList, accur)
truePosList = append(truePosList, truePos)
trueNegList = append(trueNegList, trueNeg)
falsePosList = append(falsePosList, falsePos)
falseNegList = append(falseNegList, falseNeg)
alist = append(alist, a); blist = append(blist, b); clist = append(clist, c); dlist = append(dlist, d)
}
testAvgAccuracy = mean(accuracyList)
testAvgtruePos = mean(truePosList)
testAvgtrueNeg = mean(trueNegList)
testAvgfalsePos = mean(falsePosList)
testAvgfalseNeg = mean(falseNegList)
testAvgAUC = mean(AUCList)
aAvg = mean(alist); bAvg = mean(blist); cAvg = mean(clist); dAvg = mean(dlist)
## Plot the confusion matrix of the average results as a heat map ########
dev.new(width=6, height=6)
par(mar=c(5,5,2,2))
mat = matrix(c(testAvgfalsePos, testAvgtrueNeg, testAvgtruePos, testAvgfalseNeg), ncol=2)
image(mat, axes=F, cex.lab=1.8, ylab="Actual", xlab="Predicted")
mtext(c("0","1"), las=1, side=2, adj=2, outer=F, at=c(0,1), cex=1.8)
mtext(c("0","1"), las=1, side=1, padj=1, outer=F, at=c(0,1), cex=1.8)
text(1,1,paste("true positives\n", "(had fire;\n predicted fire)\n","\n", "n =", round(dAvg,digits=0), "\n", round(testAvgtruePos,digits=4)), cex=1.6, col="white")
text(0,0,paste("true negatives\n", "(no fire;\n predicted no fire)\n","\n", "n =", round(aAvg,digits=0), "\n", round(testAvgtrueNeg,digits=4)), cex=1.6, col="white")
text(0,1,paste("false negatives\n", "(had fire;\n predicted no fire)\n","\n","n =", round(cAvg,digits=0), "\n", round(testAvgfalseNeg,digits=4)), cex=1.6)
text(1,0,paste("false positives\n", "(no fire;\n predicted fire)\n","\n","n =", round(bAvg,digits=0), "\n", round(testAvgfalsePos,digits=4)), cex=1.6)
## See how well the model did on average, and print out a table of the results ########
accuracyList = append(accuracyList, testAvgAccuracy)
truePosList = append(truePosList, testAvgtruePos)
trueNegList = append(trueNegList, testAvgtrueNeg)
falsePosList = append(falsePosList, testAvgfalsePos)
falseNegList = append(falseNegList, testAvgfalseNeg)
AUCList = append(AUCList, testAvgAUC)
results = data.frame((c(1:folds, "TEST AVERAGE:")), accuracyList, AUCList, truePosList, trueNegList, falsePosList, falseNegList)
colnames(results) <- c("fold", "accuracy", "AUC", "true positive rate", "true negative rate", "false positive rate", "false negative rate")
print(results, row.names = FALSE)
cat("\n\nDone")