-
Notifications
You must be signed in to change notification settings - Fork 968
/
setkey.R
396 lines (369 loc) · 20.4 KB
/
setkey.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
setkey = function(x, ..., verbose=getOption("datatable.verbose"), physical=TRUE)
{
if (is.character(x)) stop("x may no longer be the character name of the data.table. The possibility was undocumented and has been removed.")
cols = as.character(substitute(list(...))[-1L])
if (!length(cols)) { cols=colnames(x) }
else if (identical(cols,"NULL")) cols=NULL
setkeyv(x, cols, verbose=verbose, physical=physical)
}
# FR #1442
setindex = function(...) setkey(..., physical=FALSE)
setindexv = function(x, cols, verbose=getOption("datatable.verbose")) {
if (is.list(cols)) {
sapply(cols, setkeyv, x=x, verbose=verbose, physical=FALSE)
return(invisible(x))
} else {
setkeyv(x, cols, verbose=verbose, physical=FALSE)
}
}
# remove these 3 after May 2019; see discussion in #3399 and notes in v1.12.2. They were marked experimental after all.
set2key = function(...) stop("set2key() is now deprecated. Please use setindex() instead.")
set2keyv = function(...) stop("set2keyv() is now deprecated. Please use setindexv() instead.")
key2 = function(...) stop("key2() is now deprecated. Please use indices() instead.")
# upgrade to error after Mar 2020. Has already been warning since 2012, and stronger warning in Mar 2019 (note in news for 1.12.2); #3399
"key<-" = function(x,value) {
warning("key(x)<-value is deprecated and not supported. Please change to use setkey() with perhaps copy(). Has been warning since 2012 and will be an error in future.")
setkeyv(x,value)
# The returned value here from key<- is then copied by R before assigning to x, it seems. That's
# why we can't do anything about it without a change in R itself. If we return NULL (or invisible()) from this key<-
# method, the table gets set to NULL. So, although we call setkeyv(x,cols) here, and that doesn't copy, the
# returned value (x) then gets copied by R.
# So, solution is that caller has to call setkey or setkeyv directly themselves, to avoid <- dispatch and its copy.
}
setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRUE)
{
if (is.null(cols)) { # this is done on a data.frame when !cedta at top of [.data.table
if (physical) setattr(x,"sorted",NULL)
setattr(x,"index",NULL) # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys.
return(invisible(x))
}
if (!is.data.table(x)) stop("x is not a data.table")
if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.")
if (physical && identical(attr(x, ".data.table.locked", exact=TRUE),TRUE)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.")
if (!length(cols)) {
warning("cols is a character vector of zero length. Removed the key, but use NULL instead, or wrap with suppressWarnings() to avoid this warning.")
setattr(x,"sorted",NULL)
return(invisible(x))
}
if (identical(cols,"")) stop("cols is the empty string. Use NULL to remove the key.")
if (!all(nzchar(cols))) stop("cols contains some blanks.")
cols = gsub("`", "", cols, fixed = TRUE)
miss = !(cols %chin% colnames(x))
if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=","))
## determine, whether key is already present:
if (identical(key(x),cols)) {
if (!physical) {
## create index as integer() because already sorted by those columns
if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer())
}
return(invisible(x))
} else if(identical(head(key(x), length(cols)), cols)){
if (!physical) {
## create index as integer() because already sorted by those columns
if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), integer())
} else {
## key is present but x has a longer key. No sorting needed, only attribute is changed to shorter key.
setattr(x,"sorted",cols)
}
return(invisible(x))
}
if (".xi" %chin% names(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.")
for (i in cols) {
.xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column
if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported as a key column type, currently.")
}
if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
newkey = paste0(cols, collapse="__")
if (!any(indices(x) == newkey)) {
if (verbose) {
tt = suppressMessages(system.time(o <- forderv(x, cols, sort=TRUE, retGrp=FALSE))) # system.time does a gc, so we don't want this always on, until refcnt is on by default in R
# suppress needed for tests 644 and 645 in verbose mode
cat("forder took", tt["user.self"]+tt["sys.self"], "sec\n")
} else {
o = forderv(x, cols, sort=TRUE, retGrp=FALSE)
}
} else {
if (verbose) cat("setkey on columns ", brackify(cols), " using existing index '", newkey, "'\n", sep="")
o = getindex(x, newkey)
}
if (!physical) {
if (is.null(attr(x, "index", exact=TRUE))) setattr(x, "index", integer())
setattr(attr(x, "index", exact=TRUE), paste0("__", cols, collapse=""), o)
return(invisible(x))
}
setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear.
if (length(o)) {
# table has deep-duplicate columns, e.g. #3496
if (any(idx <- duplicated(vapply_1c(x, address)))) {
if (verbose) { last.started.at=proc.time(); cat('Deep duplicate columns (identical address in memory) detected... copying', sum(idx), 'columns ...'); flush.console() }
for (j in which(idx)) set(x, NULL, j, copy(x[[j]]))
if (verbose) { cat("done in", timetaken(last.started.at), "\n"); flush.console() }
}
if (verbose) { last.started.at = proc.time() }
.Call(Creorder,x,o)
if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() }
} else {
if (verbose) cat("x is already ordered by these columns, no need to call reorder\n")
} # else empty integer() from forderv means x is already ordered by those cols, nothing to do.
setattr(x,"sorted",cols)
invisible(x)
}
key = function(x) attr(x, "sorted", exact=TRUE)
indices = function(x, vectors = FALSE) {
ans = names(attributes(attr(x, "index", exact=TRUE)))
if (is.null(ans)) return(ans) # otherwise character() gets returned by next line
ans = gsub("^__","",ans) # the leading __ is internal only, so remove that in result
if (isTRUE(vectors))
ans = strsplit(ans, "__", fixed = TRUE)
ans
}
getindex = function(x, name) {
# name can be "col", or "col1__col2", or c("col1","col2")
ans = attr(attr(x, 'index', exact=TRUE), paste0("__",name,collapse=""), exact=TRUE)
if (!is.null(ans) && (!is.integer(ans) || (length(ans)!=nrow(x) && length(ans)!=0L))) {
stop("Internal error: index '",name,"' exists but is invalid") # nocov
}
ans
}
haskey = function(x) !is.null(key(x))
# reorder a vector based on 'order' (integer)
# to be used in fastorder instead of x[o], but in general, it's better to replace vector subsetting with this..?
# Basic checks that all items of order are in range 1:n with no NAs are now made inside Creorder.
# FOR INTERNAL USE ONLY
setreordervec = function(x, order) .Call(Creorder, x, order)
# sort = sort.int = sort.list = order = is.unsorted = function(...)
# stop("Should never be called by data.table internals. Use is.sorted() on vectors, or forder() for lists and vectors.")
# Nice idea, but users might use these in i or j e.g. blocking order caused tests 304 to fail.
# Maybe just a grep through *.R for use of these function internally would be better (TO DO).
# Don't use base::is.unsorted internally, because :
# 1) it uses locale whereas in data.table we control locale sorting independently (C locale currently, but
# "sorted" attribute will need an extra attribute "locale" so we can check if key's locale is the current locale)
# 2) wrapper needed, used to be :
# identical(FALSE,is.unsorted(x)) && !(length(x)==1 && is.na(x))
# where the && was needed to maintain backwards compatibility after r-devel's change of is.unsorted(NA) to FALSE (was NA) [May 2013].
# The others (order, sort.int etc) are turned off to protect ourselves from using them internally, for speed and for
# consistency; e.g., consistent twiddling of numeric/integer64, NA at the beginning of integer, locale ordering of character vectors.
is.sorted = function(x, by=seq_along(x)) {
if (is.list(x)) {
warning("Use 'if (length(o <- forderv(DT,by))) ...' for efficiency in one step, so you have o as well if not sorted.")
# could pass through a flag for forderv to return early on first FALSE. But we don't need that internally
# since internally we always then need ordering, an it's better in one step. Don't want inefficiency to creep in.
# This is only here for user/debugging use to check/test valid keys; e.g. data.table:::is.sorted(DT,by)
0L == length(forderv(x,by,retGrp=FALSE,sort=TRUE))
} else {
if (!missing(by)) stop("x is vector but 'by' is supplied")
.Call(Cfsorted, x)
}
# Cfsorted could be named CfIsSorted, but since "sorted" is an adjective not verb, it's clear; e.g., Cfsort would sort it ("sort" is verb).
# Return value of TRUE/FALSE is relied on in [.data.table quite a bit on vectors. Simple. Stick with that (rather than -1/0/+1)
# Important to call forder.c::fsorted here, for consistent character ordering and numeric/integer64 twiddling.
}
ORDERING_TYPES = c('logical', 'integer', 'double', 'complex', 'character')
forderv = function(x, by=seq_along(x), retGrp=FALSE, sort=TRUE, order=1L, na.last=FALSE)
{
if (!(sort || retGrp)) stop("At least one of retGrp or sort must be TRUE")
na.last = as.logical(na.last)
if (!length(na.last)) stop('length(na.last) = 0')
if (length(na.last) != 1L) {
warning("length(na.last) > 1, only the first element will be used")
na.last = na.last[1L]
}
# TO DO: export and document forder
if (is.atomic(x)) {
if (!missing(by) && !is.null(by)) stop("x is a single vector, non-NULL 'by' doesn't make sense")
by = NULL
if ( !missing(order) && (length(order) != 1L || !(order %in% c(1L, -1L))) )
stop("x is a single vector, length(order) must be =1 and it's value should be 1 (ascending) or -1 (descending).")
} else {
if (!length(x)) return(integer(0L)) # to be consistent with base::order. this'll make sure forderv(NULL) will result in error
# (as base does) but forderv(data.table(NULL)) and forderv(list()) will return integer(0L))
by = colnamesInt(x, by, check_dups=FALSE)
if ( (length(order) != 1L && length(order) != length(by)) || !all(order %in% c(1L, -1L)) )
stop("x is a list, length(order) must be either =1 or =length(by) and each value should be 1 or -1 for each column in 'by', corresponding to ascending or descending order, respectively. If length(order) == 1, it will be recycled to length(by).")
if (length(order) == 1L) order = rep(order, length(by))
}
order = as.integer(order)
.Call(Cforder, x, by, retGrp, sort, order, na.last) # returns integer() if already sorted, regardless of sort=TRUE|FALSE
}
forder = function(x, ..., na.last=TRUE, decreasing=FALSE)
{
if (!is.data.table(x)) stop("x must be a data.table.")
if (ncol(x) == 0L) stop("Attempting to order a 0-column data.table.")
if (is.na(decreasing) || !is.logical(decreasing)) stop("'decreasing' must be logical TRUE or FALSE")
cols = substitute(list(...))[-1L]
if (identical(as.character(cols),"NULL") || !length(cols) || (length(cols) == 1L && !nzchar(cols))) return(NULL) # to provide the same output as base::order
ans = x
order = rep(1L, length(cols))
if (length(cols)) {
ans = vector("list", length(cols))
cols = as.list(cols)
xcols = names(x)
for (i in seq_along(cols)) {
v=cols[[i]]
if (i == 1L && is.call(v) && length(v) == 2L && v[[1L]] == "list") return(1L) # to be consistent with base, see comment below under while loop
while (is.call(v) && length(v) == 2L && v[[1L]] != "list") {
# take care of "--x", "{-x}", "(---+x)" etc., cases and also "list(y)". 'list(y)' is ambiguous though. In base, with(DT, order(x, list(y))) will error
# that 'arguments are not of same lengths'. But with(DT, order(list(x), list(y))) will return 1L, which is very strange. On top of that, with(DT,
# order(x, as.list(10:1)) would return 'unimplemented type list'. It's all very inconsistent. But we HAVE to be consistent with base HERE.
if (!as.character(v[[1L]]) %chin% c("+", "-")) break # FIX for bug #5583
if (v[[1L]] == "-") order[i] = -order[i]
v = v[[-1L]]
}
if (is.name(v)) {
ix = chmatch(as.character(v), xcols, nomatch=0L)
if (ix != 0L) ans = point(ans, i, x, ix) # see 'point' in data.table.R and C-version pointWrapper in assign.c - avoid copies
else {
v = as.call(list(as.name("list"), v))
ans = point(ans, i, eval(v, x, parent.frame()), 1L)
}
} else {
if (!is.object(eval(v, x, parent.frame()))) {
v = as.call(list(as.name("list"), v))
ans = point(ans, i, eval(v, x, parent.frame()), 1L) # eval has to make a copy here (not due to list(.), but due to ex: "4-5*y"), unavoidable.
} else ans = point(ans, i, list(unlist(eval(v, x, parent.frame()))), 1L)
} # else stop("Column arguments to order by in 'forder' should be of type name/symbol (ex: quote(x)) or call (ex: quote(-x), quote(x+5*y))")
}
}
cols = seq_along(ans)
# Supported column types are checked at C level
o = forderv(ans, cols, sort=TRUE, retGrp=FALSE, order= if (decreasing) -order else order, na.last)
if (!length(o)) o = seq_along(ans[[1L]]) else o
o
}
fsort = function(x, decreasing=FALSE, na.last=FALSE, internal=FALSE, verbose=FALSE, ...)
{
containsNAs = FALSE
if (typeof(x)=="double" && !decreasing && !(containsNAs <- anyNA(x))) {
if (internal) stop("Internal code should not be being called on type double")
return(.Call(Cfsort, x, verbose))
}
else {
# fsort is now exported for testing. Trying to head off complaints "it's slow on integer"
# The only places internally we use fsort internally (3 calls, all on integer) have had internal=TRUE added for now.
# TODO: implement integer and character in Cfsort and remove this branch and warning
if (!internal){
if (typeof(x)!="double") warning("Input is not a vector of type double. New parallel sort has only been done for double vectors so far. Using one thread.")
if (decreasing) warning("New parallel sort has not been implemented for decreasing=TRUE so far. Using one thread.")
if (containsNAs) warning("New parallel sort has not been implemented for vectors containing NA values so far. Using one thread.")
}
orderArg = if (decreasing) -1 else 1
o = forderv(x, order=orderArg, na.last=na.last)
return( if (length(o)) x[o] else x )
}
}
setorder = function(x, ..., na.last=FALSE)
# na.last=FALSE here, to be consistent with data.table's default
# as opposed to DT[order(.)] where na.last=TRUE, to be consistent with base
{
if (!is.data.frame(x)) stop("x must be a data.frame or data.table.")
cols = substitute(list(...))[-1L]
if (identical(as.character(cols),"NULL")) return(x)
if (length(cols)) {
cols=as.list(cols)
order=rep(1L, length(cols))
for (i in seq_along(cols)) {
v=as.list(cols[[i]])
if (length(v) > 1L && v[[1L]] == "+") v=v[[-1L]]
else if (length(v) > 1L && v[[1L]] == "-") {
v=v[[-1L]]
order[i] = -1L
}
cols[[i]]=as.character(v)
}
cols=unlist(cols, use.names=FALSE)
} else {
cols=colnames(x)
order=rep(1L, length(cols))
}
setorderv(x, cols, order, na.last)
}
setorderv = function(x, cols = colnames(x), order=1L, na.last=FALSE)
{
if (is.null(cols)) return(x)
if (!is.data.frame(x)) stop("x must be a data.frame or data.table")
na.last = as.logical(na.last)
if (is.na(na.last) || !length(na.last)) stop('na.last must be logical TRUE/FALSE')
if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setorder.")
if (!length(cols)) {
warning("cols is a character vector of zero length. Use NULL instead, or wrap with suppressWarnings() to avoid this warning.")
return(x)
}
if (!all(nzchar(cols))) stop("cols contains some blanks.") # TODO: probably I'm checking more than necessary here.. there are checks in 'forderv' as well
# remove backticks from cols
cols = gsub("`", "", cols, fixed = TRUE)
miss = !(cols %chin% colnames(x))
if (any(miss)) stop("some columns are not in the data.table: ", paste(cols[miss], collapse=","))
if (".xi" %chin% colnames(x)) stop("x contains a column called '.xi'. Conflicts with internal use by data.table.")
for (i in cols) {
.xi = x[[i]] # [[ is copy on write, otherwise checking type would be copying each column
if (!typeof(.xi) %chin% ORDERING_TYPES) stop("Column '",i,"' is type '",typeof(.xi),"' which is not supported for ordering currently.")
}
if (!is.character(cols) || length(cols)<1L) stop("Internal error. 'cols' should be character at this point in setkey; please report.") # nocov
o = forderv(x, cols, sort=TRUE, retGrp=FALSE, order=order, na.last=na.last)
if (length(o)) {
.Call(Creorder, x, o)
if (is.data.frame(x) & !is.data.table(x)) {
setattr(x, 'row.names', rownames(x)[o])
}
k = key(x)
if (!identical(head(cols, length(k)), k) || any(head(order, length(k)) < 0))
setattr(x, 'sorted', NULL) # if 'forderv' is not 0-length & key is not a same-ordered subset of cols, it means order has changed. So, set key to NULL, else retain key.
setattr(x, 'index', NULL) # remove secondary keys too. These could be reordered and retained, but simpler and faster to remove
}
invisible(x)
}
binary = function(x) .Call(Cbinary, x)
setNumericRounding = function(x) {.Call(CsetNumericRounding, as.integer(x)); invisible()}
getNumericRounding = function() .Call(CgetNumericRounding)
SJ = function(...) {
JDT = as.data.table(list(...))
setkey(JDT)
}
# S for Sorted, usually used in i to sort the i table
# TO DO?: Use the CJ list() replication method for SJ (inside as.data.table.list?, #2109) too to avoid alloc.col
CJ = function(..., sorted = TRUE, unique = FALSE)
{
# Pass in a list of unique values, e.g. ids and dates
# Cross Join will then produce a join table with the combination of all values (cross product).
# The last vector is varied the quickest in the table, so dates should be last for roll for example
l = list(...)
if (isFALSE(getOption("datatable.CJ.names", TRUE))) { # default TRUE from v1.12.0, FALSE before. TODO: remove option in v1.13.0 as stated in news
if (is.null(vnames <- names(l))) vnames = paste0("V", seq_len(length(l)))
else if (any(tt <- vnames=="")) vnames[tt] = paste0("V", which(tt))
} else {
vnames = name_dots(...)
}
dups = FALSE # fix for #1513
for (i in seq_along(l)) {
y = l[[i]]
if (!length(y)) next
if (sorted) {
if (!is.atomic(y)) stop("'sorted' is TRUE but element ", i, " is non-atomic, which can't be sorted; try setting sorted = FALSE")
o = forderv(y, retGrp=TRUE)
thisdups = attr(o, 'maxgrpn', exact=TRUE)>1L
if (thisdups) {
dups = TRUE
if (length(o)) l[[i]] = if (unique) y[o[attr(o, "starts", exact=TRUE)]] else y[o]
else if (unique) l[[i]] = y[attr(o, "starts", exact=TRUE)] # test 1525.5
} else {
if (length(o)) l[[i]] = y[o]
}
} else {
if (unique) l[[i]] = unique(y)
}
}
nrow = prod( vapply_1i(l, length) ) # lengths(l) will work from R 3.2.0
if (nrow > .Machine$integer.max) stop("Cross product of elements provided to CJ() would result in ",nrow," rows which exceeds .Machine$integer.max == ",.Machine$integer.max)
l = .Call(Ccj, l)
setDT(l)
l = alloc.col(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency since
# it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation
setnames(l, vnames)
if (sorted) {
if (!dups) setattr(l, 'sorted', names(l))
else setkey(l) # fix #1513
}
l
}