---
title: "MICE Step"
output: html_notebook
---
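This notebook fills gaps in the hourly balancing-authority demand record using multiple imputation by chained equations (MICE). Demand is imputed on the log scale, with each region's own lag-1, lead-1, and climatological-mean series plus the other regions' concurrent demand as predictors, and the results are saved both as individual imputations and as their across-imputation mean.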
```{r}
library(dplyr)
library(data.table)
library(zoo)
library(mice)
library(parallel)
library(reshape2)
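# lubridate is not attached here; it is called below via lubridate::yday, lubridate::hour, and lubridate::year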
impute_file_path = paste0("data/csv_MASTER.csv")
year_updated = 2020
# load the input file
csv_MASTER <- read.csv(impute_file_path)
# keep data from July 2, 2015 through July 1 of the final year (year_updated)
csv_MASTER = csv_MASTER %>% dplyr::filter(as.Date(date_time) > "2015-07-01" &
as.Date(date_time) < paste0(year_updated, "-07-02"))
# version used for subregions (data starts in mid-2018 instead of mid-2015)
#csv_MASTER = csv_MASTER %>% dplyr::filter(as.Date(date_time) > "2018-09-30" &
# as.Date(date_time) < paste0(year_updated, "-10-01"))
# we want the smoothing window to be 15 x the number of years of data
clim_predictor_smoothing_window = 15 * floor(nrow(csv_MASTER)/365/24)
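# (with the data sorted by day-of-year within each BA and hour below, a centered window of 15 * n_years values spans roughly 15 calendar days, pooled across all years)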
# remove all columns that specify the category of the hourly demand data
# also remove the data for the OVEC and SEC balancing authorities
pre_imputed_data = csv_MASTER %>% dplyr::select(-contains('_category'),
-OVEC,
-SEC)
# take the log of the hourly demand data
data_wide_logged = data.frame(date_time = pre_imputed_data$date_time,
pre_imputed_data %>% dplyr::select(-date_time) %>% apply(MARGIN = 2, FUN = function(x) log(x)))
# Now we derive:
# (1) the climatological predictor variable -- called "demand_clim_mean"
# (2) the lead 1 predictor -- called "demand_lead1"
# (3) the lag 1 predictor -- called "demand_lag1"
# (4) rename the concurrent demand values to predict
data_pre_impute0 = data_wide_logged %>% data.table() %>%
data.table::melt(id.vars = c('date_time'), variable.name = "BA") %>%
dplyr::mutate(doy = lubridate::yday(date_time),
hour = lubridate::hour(date_time)) %>%
dplyr::mutate(original = 1)
leap_years = c(2016,2020,2024)
# tack on data to the front and back to remove the edge effects at the beginning and the end of each year
data_pre_impute_tack_front_end = data_pre_impute0 %>% dplyr::filter(doy <= (15-1)/2) %>%
dplyr::mutate(original = 0,
doy = ifelse(lubridate::year(date_time) %in% c(leap_years),
doy + 366,
doy + 365))
data_pre_impute_tack_back_end = data_pre_impute0 %>% dplyr::filter(ifelse(lubridate::year(date_time) %in% c(leap_years),
doy > 366 - (15-1)/2,
doy > 365 - (15-1)/2)) %>%
dplyr::mutate(original = 0,
doy = ifelse(lubridate::year(date_time) %in% c(leap_years),
doy - 366,
doy - 365))
data_pre_impute = rbind(data_pre_impute_tack_front_end,
data_pre_impute0,
data_pre_impute_tack_back_end) %>%
dplyr::arrange(BA,doy,hour) %>%
dplyr::group_by(BA,hour) %>%
dplyr::mutate(demand_clim_mean = rollapply(data = value,
width = clim_predictor_smoothing_window,
FUN = mean,
align = "center",
fill = NA,
na.rm = TRUE,
partial = TRUE)) %>%
dplyr::filter(original == 1) %>%
dplyr::arrange(BA,date_time) %>%
dplyr::group_by(BA) %>%
dplyr::mutate(demand_lag1 = lag(value),
demand_lead1 = lead(value),
date_time = as.POSIXct(date_time, tryFormats = c("%Y-%m-%d %H:%M"), tz = 'UTC')) %>%
ungroup() %>%
dplyr::select(-doy, -hour, -original)
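# label each BA's series so its concurrent, climatological-mean, lag-1, and lead-1 values get distinct column names when cast wide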
data_pre_impute = data_pre_impute %>% dplyr::mutate(BA_clim_mean_label = paste0(BA, '_clim_mean'),
BA_concurrent_label = paste0(BA, '_conc'),
BA_lag1_label = paste0(BA, '_lag1'),
BA_lead1_label = paste0(BA, '_lead1'))
# construct the prediction matrix -- components are (1) the raw data, (2) the climatological mean for each region, (3) lag1 for each region, (4) lead1 for each region
data_pre_impute_wide = merge(merge(merge(reshape2::dcast(data = data_pre_impute, date_time ~ BA_concurrent_label, value.var = 'value', fun = mean),
reshape2::dcast(data = data_pre_impute, date_time ~ BA_clim_mean_label, value.var = 'demand_clim_mean', fun = mean),
by = 'date_time'),
reshape2::dcast(data = data_pre_impute, date_time ~ BA_lag1_label, value.var = 'demand_lag1', fun = mean),
by = 'date_time'),
reshape2::dcast(data = data_pre_impute, date_time ~ BA_lead1_label, value.var = 'demand_lead1', fun = mean),
by = 'date_time')
# let's adjust the predictor matrix
pred_mat = make.predictorMatrix(data_pre_impute_wide %>% dplyr::select(-date_time))
# (1) only use a var's own lag1 var to impute that var
pred_mat[grep(x = rownames(pred_mat), pattern = '_conc'),
grep(x = colnames(pred_mat), pattern = '_lag1')] = diag(nrow = length(grep(x = rownames(pred_mat), pattern = '_conc')),
ncol = length(grep(x = colnames(pred_mat), pattern = '_lag1')))
# (1b) only use a var's own lead1 var to impute that var
pred_mat[grep(x = rownames(pred_mat), pattern = '_conc'),
grep(x = colnames(pred_mat), pattern = '_lead1')] = diag(nrow = length(grep(x = rownames(pred_mat), pattern = '_conc')),
ncol = length(grep(x = colnames(pred_mat), pattern = '_lead1')))
# (1c) only use a var's own clim_mean var to impute that var
pred_mat[grep(x = rownames(pred_mat), pattern = '_conc'),
grep(x = colnames(pred_mat), pattern = '_clim_mean')] = diag(nrow = length(grep(x = rownames(pred_mat), pattern = '_conc')),
ncol = length(grep(x = colnames(pred_mat), pattern = '_clim_mean')))
# (2) do not use the concurrent (_conc) vars to impute the lag1 or lead1 vars
pred_mat[grep(x = rownames(pred_mat), pattern = '_lag1|_lead1'),
grep(x = colnames(pred_mat), pattern = '_conc')] = 0
# (3) only use lead1 vars to impute lead1 vars and lag1 vars to impute lag1 vars
pred_mat[grep(x = rownames(pred_mat), pattern = '_lead1'),
grep(x = colnames(pred_mat), pattern = '_lag1|_clim_mean')] =
pred_mat[grep(x = rownames(pred_mat), pattern = '_lag1'),
grep(x = colnames(pred_mat), pattern = '_lead1|_clim_mean')] = 0
# (4) do not use any predictors for the clim_mean vars (they are not imputed from the other vars)
pred_mat[grep(x = row.names(pred_mat), pattern = '_clim_mean'),] = 0
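# Optional sanity check (an added sketch, not part of the original pipeline):
# each '_conc' variable should retain exactly one '_lag1' predictor -- its own BA's
stopifnot(all(rowSums(pred_mat[grep(x = rownames(pred_mat), pattern = '_conc'),
grep(x = colnames(pred_mat), pattern = '_lag1'), drop = FALSE]) == 1))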
# MICE is parallelized across 4 cores here (n.core below);
# adjust this to the number of cores available on your machine
# impute with mice
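# ('norm.boot' is mice's linear-regression-with-bootstrap imputation method)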
data_imp_mice_lag_embed_par = parlmice(data = data_pre_impute_wide %>% dplyr::select(-date_time),
n.imp.core = 4,
maxit = 10,
predictorMatrix = pred_mat,
method = c('norm.boot'),
printFlag = FALSE,
cluster.seed = 222,
n.core = 4,
remove_collinear = FALSE,
maxcor = 0.9999999)
dir.create('MICE_output', showWarnings = FALSE)
pdf(file = paste0('MICE_output/convergence_check_impute_csv_MASTER.pdf'), width = 10, height = 10)
print(
plot(data_imp_mice_lag_embed_par)
)
dev.off()
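# parlmice returns n.core * n.imp.core = 16 completed datasets; stack them long with an imputation index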
for(iimp in 1:16){
mice_lag_embed_imputations_temp = data.frame(date_time = data_pre_impute_wide$date_time,
complete(data_imp_mice_lag_embed_par, iimp) %>% dplyr::select(names(data_pre_impute_wide)[-1])) %>%
dplyr::mutate(imp_index = iimp)
if(iimp == 1){mice_lag_embed_imputations = mice_lag_embed_imputations_temp}
if(iimp > 1){mice_lag_embed_imputations = rbind(mice_lag_embed_imputations,mice_lag_embed_imputations_temp)}
print(iimp)
}
mice_lag_embed_imputations = mice_lag_embed_imputations %>% dplyr::select(-contains('_lag1'), -contains('lead1'), -contains('clim_mean')) %>%
setNames(sub("_conc", "", names(.)))
# save the mean of all imputations as integer values
mice_lag_embed_imputations_summarized = mice_lag_embed_imputations %>% reshape2::melt(id.vars = c('date_time', 'imp_index'), variable.name = 'BA', value.name = 'demand') %>%
as.data.table() %>%
dplyr::group_by(date_time, BA) %>%
dplyr::summarise(demand = as.integer(round(mean(exp(demand), na.rm = TRUE),0))) %>%
ungroup()
mice_lag_embed_imputations_summarized_wide = mice_lag_embed_imputations_summarized %>% reshape2::dcast(date_time ~ BA, value.var = 'demand', fun = mean)
# save the mean of all MICE chains
write.csv(x = mice_lag_embed_imputations_summarized_wide,
file = 'MICE_output/mean_impute_csv_MASTER.csv')
# save all imputations as integer values
mice_lag_embed_imputations_all = mice_lag_embed_imputations %>% reshape2::melt(id.vars = c('date_time', 'imp_index'), variable.name = 'BA', value.name = 'demand') %>%
as.data.table() %>%
dplyr::group_by(date_time, BA, imp_index) %>%
dplyr::mutate(demand = as.integer(round(exp(demand),0))) %>%
ungroup()
mice_lag_embed_imputations_all_wide = mice_lag_embed_imputations_all %>% reshape2::dcast(date_time + imp_index ~ BA, value.var = 'demand', fun = mean)
# save the individual MICE chains
write.csv(x = mice_lag_embed_imputations_all_wide,
file = 'MICE_output/all_impute_csv_MASTER.csv')
rm(list = ls())
```
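The chunk writes three files to `MICE_output/`: `convergence_check_impute_csv_MASTER.pdf` with the MICE convergence traces, `mean_impute_csv_MASTER.csv` with the across-imputation mean demand for each hour and balancing authority, and `all_impute_csv_MASTER.csv` with every individual imputation.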