Skip to content

Commit

Permalink
Closes #3496 and #3766 -- detect & block setkey from double-reorderin…
Browse files Browse the repository at this point in the history
…g identical columns

vestigial
  • Loading branch information
Michael Chirico committed Aug 15, 2019
1 parent a8e926a commit 3b3239c
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 14 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -228,6 +228,8 @@

27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report.

28. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Thanks @kirillmayantsev and @alex46015 for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue.

#### NOTES

1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
Expand Down
18 changes: 9 additions & 9 deletions R/data.table.R
Expand Up @@ -1266,25 +1266,25 @@ replace_order = function(isub, verbose, env) {

jval = eval(jsub, SDenv, parent.frame())
# copy 'jval' when required
# More speedup - only check + copy if irows is NULL
# Temp fix for #921 - check address and copy *after* evaluating 'jval'
if (is.null(irows)) {
if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg.
jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list()
if (jcpy) jval = copy(jval)
} else if (address(jval) == address(SDenv$.SD)) {
# need to check if copy needed regardless of irows status (#3766)
if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg.
jcpy = address(jval) %chin% vapply_1c(SDenv$.SD, address)
if (jcpy) jval = copy(jval)
} else {
if (address(jval) == address(SDenv$.SD)) {
jval = copy(jval)
} else if ( length(jcpy <- which(vapply_1c(jval, address) %in% vapply_1c(SDenv, address))) ) {
} else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) {
for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]])
} else if (is.call(jsub) && jsub[[1L]] == "get" && is.list(jval)) {
} else if (is.call(jsub) && jsub[[1L]] == "get") {
jval = copy(jval) # fix for #1212
}
} else {
if (is.data.table(jval)) {
setattr(jval, '.data.table.locked', NULL) # fix for #1341
if (!truelength(jval)) alloc.col(jval)
}
}

if (!is.null(lhs)) {
# TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
.Call(Cassign,x,irows,cols,newnames,jval)
Expand Down
13 changes: 8 additions & 5 deletions R/setkey.R
Expand Up @@ -102,12 +102,15 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
}
setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear.
if (length(o)) {
if (verbose) {
tt = suppressMessages(system.time(.Call(Creorder,x,o)))
cat("reorder took", tt["user.self"]+tt["sys.self"], "sec\n")
} else {
.Call(Creorder,x,o)
# table has deep-duplicate columns, e.g. #3496
if (any(idx <- duplicated(vapply_1c(x, address)))) {
if (verbose) { last.started.at=proc.time(); cat('Deep duplicate columns (identical address in memory) detected... copying', sum(idx), 'columns ...'); flush.console() }
for (j in which(idx)) set(x, NULL, j, copy(x[[j]]))
if (verbose) { cat("done in", timetaken(last.started.at), "\n"); flush.console() }
}
if (verbose) { last.started.at = proc.time() }
.Call(Creorder,x,o)
if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() }
} else {
if (verbose) cat("x is already ordered by these columns, no need to call reorder\n")
} # else empty integer() from forderv means x is already ordered by those cols, nothing to do.
Expand Down
10 changes: 10 additions & 0 deletions inst/tests/tests.Rraw
Expand Up @@ -15671,6 +15671,16 @@ if (test_bit64) {
test(2077.06, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g]))
}

# #3766 simple queries can create tables with deep-copy-identical columns
x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1)
test(2078.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))])
# #3496 generally need to protect against deep-copy-idential columns in setkey
x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
x$b = x$a
setDT(x)
test(2078.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns')
test(2078.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a'))


###################################
# Add new tests above this line #
Expand Down

0 comments on commit 3b3239c

Please sign in to comment.