/
cross-validate.R
113 lines (91 loc) · 3.75 KB
/
cross-validate.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#####################################
######### CROSS VALIDATE ##########
#####################################
# make sure you have run source("get-data.R") at least once!
source("set-workspace.R") # takes some time to load Y
########## Prepare to run model ##########
# Dimensions
n <- nrow(Y)
m <- ncol(Y)
# Specify number of cores to parallelize across
# It's best to run this file across many cores (>5)
ncore <- 10
registerDoMC(ncore)
# Additionally, split list of taxa by number of cores
taxa.list <- split(1:m, gl(ncore, ceiling(m / ncore), length = m))
# Cross-validation across nfold folds
nfold <- 5 # typically 5 or 10
fold <- rep(1:nfold, n)[1:n]
set.seed(777)
fold <- sample(fold)
# Possible kernel bandwidths (defined by rho in the accompanying manuscript)
bw <- c(100, 200, 300, 400, 500, 1000, 10000)
# Create prediction grid T across USA
# Finer resolution grid requires increased computation time
Tgrid <- generate_grid(x = 100, y = 100)
# Define empty list to store model results
results <- vector("list", nfold)
# record time before beginning cross-validation
ptm <- proc.time()
for (f in 1:nfold) { # Begin cross-validation
print(paste("Beginning fold", f, "of", nfold))
# For fold f, split data into 80% train, 20% test
# train used for two major steps (see below)
# test will help determine performance of our model
test <- fold == f
train <- !test
Ytest <- Y[test, ]
Stest <- S[test, ]
Ytrain <- Y[train, ]
Strain <- S[train, ]
# Split training data into 80% train2, 20% test2
# train2 used for kernel smoothing & GCV
# test2 used to find q for prediction regions
set.seed(0820 * f)
test2 <- runif(sum(train)) < 0.2
train2 <- !test2
Ytest2 <- Ytrain[test2, ]
Stest2 <- Strain[test2, ]
Ytrain2 <- Ytrain[train2, ]
Strain2 <- Strain[train2, ]
# Split Ytrain2, Ytest and Ytest2 by taxa.list for parallelization
Ytrain2.list <- lapply(taxa.list, subset_columns, Ytrain2)
Ytest.list <- lapply(taxa.list, subset_columns, Ytest)
Ytest2.list <- lapply(taxa.list, subset_columns, Ytest2)
# Train model via best bw and kernel smooth
# Produces matrix of estimated occurence probabilities
print("Estimating occurence probabilities...")
M.list <- foreach(Y = Ytrain2.list) %dopar%
train_model(Y, S = Strain2, bw = bw, Tgrid = Tgrid)
rm(Ytrain2.list, Strain2)
## Find log-likelihood for every sample in test and test2
print("Calculating log-likelihood values for test set...")
llike.test <- foreach(Y = Ytest.list, M = M.list, .combine = "+") %dopar%
calculate_log_likelihood(Y, M)
rm(Ytest.list)
print("Calculating log-likelihood values for test2 set...")
llike.test2 <- foreach(Y = Ytest2.list, M = M.list, .combine = "+") %dopar%
calculate_log_likelihood(Y, M)
rm(M.list, Ytest2.list)
## Obtain predictive pmfs by normalizing log-likelihoods
pmf.test <- make_pmf(llike.test)
pmf.test2 <- make_pmf(llike.test2)
rm(llike.test, llike.test2)
## Make spatial source predictions!
Stest.hat <- Tgrid[apply(pmf.test, 2, which.max), ]
rownames(Stest.hat) <- rownames(Stest)
Stest2.hat <- Tgrid[apply(pmf.test2, 2, which.max), ]
rownames(Stest2.hat) <- rownames(Stest2)
## Keep pmfs, test sets, and train sets
print("Storing results...")
results[[f]] <- list(pmf.test = pmf.test, Stest = Stest, Stest.hat = Stest.hat,
pmf.test2 = pmf.test2, Stest2 = Stest2, Stest2.hat = Stest2.hat)
rm(pmf.test, Stest, Stest.hat, pmf.test2, Stest2, Stest2.hat)
print(paste("Finished with fold", f, "of", nfold))
}
# End cross-validation
proc.time() - ptm # elapsed time of cross-validation
# save results
print("Saving final results...")
save(list = c("results", "Tgrid"), file = "results.RData")
print("Done!")