From 3b3239c4b238c78fd09ad35b2d24a704922a9bea Mon Sep 17 00:00:00 2001 From: Michael Chirico Date: Fri, 16 Aug 2019 00:19:44 +0800 Subject: [PATCH] Closes #3496 and #3766 -- detect & block setkey from double-reordering identical columns vestigial --- NEWS.md | 2 ++ R/data.table.R | 18 +++++++++--------- R/setkey.R | 13 ++++++++----- inst/tests/tests.Rraw | 10 ++++++++++ 4 files changed, 29 insertions(+), 14 deletions(-) diff --git a/NEWS.md b/NEWS.md index bc6b599c3..867664e00 100644 --- a/NEWS.md +++ b/NEWS.md @@ -228,6 +228,8 @@ 27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report. +28. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Thanks @kirillmayantsev and @alex46015 for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue. + #### NOTES 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below. diff --git a/R/data.table.R b/R/data.table.R index 47c672932..e926612f4 100644 --- a/R/data.table.R +++ b/R/data.table.R @@ -1266,25 +1266,25 @@ replace_order = function(isub, verbose, env) { jval = eval(jsub, SDenv, parent.frame()) # copy 'jval' when required - # More speedup - only check + copy if irows is NULL # Temp fix for #921 - check address and copy *after* evaluating 'jval' - if (is.null(irows)) { - if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. - jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list() - if (jcpy) jval = copy(jval) - } else if (address(jval) == address(SDenv$.SD)) { + # need to check if copy needed regardless of irows status (#3766) + if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg. + jcpy = address(jval) %chin% vapply_1c(SDenv$.SD, address) + if (jcpy) jval = copy(jval) + } else { + if (address(jval) == address(SDenv$.SD)) { jval = copy(jval) - } else if ( length(jcpy <- which(vapply_1c(jval, address) %in% vapply_1c(SDenv, address))) ) { + } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) { for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]]) - } else if (is.call(jsub) && jsub[[1L]] == "get" && is.list(jval)) { + } else if (is.call(jsub) && jsub[[1L]] == "get") { jval = copy(jval) # fix for #1212 } - } else { if (is.data.table(jval)) { setattr(jval, '.data.table.locked', NULL) # fix for #1341 if (!truelength(jval)) alloc.col(jval) } } + if (!is.null(lhs)) { # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above. .Call(Cassign,x,irows,cols,newnames,jval) diff --git a/R/setkey.R b/R/setkey.R index fbf8accbe..0037a9ddb 100644 --- a/R/setkey.R +++ b/R/setkey.R @@ -102,12 +102,15 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU } setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear. if (length(o)) { - if (verbose) { - tt = suppressMessages(system.time(.Call(Creorder,x,o))) - cat("reorder took", tt["user.self"]+tt["sys.self"], "sec\n") - } else { - .Call(Creorder,x,o) + # table has deep-duplicate columns, e.g. #3496 + if (any(idx <- duplicated(vapply_1c(x, address)))) { + if (verbose) { last.started.at=proc.time(); cat('Deep duplicate columns (identical address in memory) detected... copying', sum(idx), 'columns ...'); flush.console() } + for (j in which(idx)) set(x, NULL, j, copy(x[[j]])) + if (verbose) { cat("done in", timetaken(last.started.at), "\n"); flush.console() } } + if (verbose) { last.started.at = proc.time() } + .Call(Creorder,x,o) + if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() } } else { if (verbose) cat("x is already ordered by these columns, no need to call reorder\n") } # else empty integer() from forderv means x is already ordered by those cols, nothing to do. diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 58c3c63c4..503bdc8a7 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -15671,6 +15671,16 @@ if (test_bit64) { test(2077.06, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g])) } +# #3766 simple queries can create tables with deep-copy-identical columns +x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1) +test(2078.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))]) +# #3496 generally need to protect against deep-copy-idential columns in setkey +x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE) +x$b = x$a +setDT(x) +test(2078.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns') +test(2078.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a')) + ################################### # Add new tests above this line #