From 3b3239c4b238c78fd09ad35b2d24a704922a9bea Mon Sep 17 00:00:00 2001
From: Michael Chirico <michael.chirico@grabtaxi.com>
Date: Fri, 16 Aug 2019 00:19:44 +0800
Subject: [PATCH] Closes #3496 and #3766 -- detect & block setkey from
 double-reordering identical columns

vestigial
---
 NEWS.md               |  2 ++
 R/data.table.R        | 18 +++++++++---------
 R/setkey.R            | 13 ++++++++-----
 inst/tests/tests.Rraw | 10 ++++++++++
 4 files changed, 29 insertions(+), 14 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index bc6b599c3..867664e00 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -228,6 +228,8 @@
 
 27. `integer64` sum-by-group is now properly optimized, [#1647](https://github.com/Rdatatable/data.table/issues/1647), [#3464](https://github.com/Rdatatable/data.table/issues/3464). Thanks to @mlandry22-h2o for the report.
 
+28. `setkey`/`setkeyv` would mangle the ordering of tables with memory-duplicate (i.e., sharing the same `address`) columns, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). This is fixed by forcing copies of duplicated columns before reordering. Relatedly, `[` could also produce such columns when subsetting. Thanks @kirillmayantsev and @alex46015 for reporting and @jaapwalhout and @Atrebas helping to debug&isolate the issue.
+
 #### NOTES
 
 1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
diff --git a/R/data.table.R b/R/data.table.R
index 47c672932..e926612f4 100644
--- a/R/data.table.R
+++ b/R/data.table.R
@@ -1266,25 +1266,25 @@ replace_order = function(isub, verbose, env) {
 
     jval = eval(jsub, SDenv, parent.frame())
     # copy 'jval' when required
-    # More speedup - only check + copy if irows is NULL
     # Temp fix for #921 - check address and copy *after* evaluating 'jval'
-    if (is.null(irows)) {
-      if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg.
-        jcpy = address(jval) %in% vapply_1c(SDenv$.SD, address) # %chin% errors when RHS is list()
-        if (jcpy) jval = copy(jval)
-      } else if (address(jval) == address(SDenv$.SD)) {
+    # need to check if copy needed regardless of irows status (#3766)
+    if (!is.list(jval)) { # performance improvement when i-arg is S4, but not list, #1438, Thanks @DCEmilberg.
+      jcpy = address(jval) %chin% vapply_1c(SDenv$.SD, address)
+      if (jcpy) jval = copy(jval)
+    } else {
+      if (address(jval) == address(SDenv$.SD)) {
         jval = copy(jval)
-      } else if ( length(jcpy <- which(vapply_1c(jval, address) %in% vapply_1c(SDenv, address))) ) {
+      } else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) {
         for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]])
-      } else if (is.call(jsub) && jsub[[1L]] == "get" && is.list(jval)) {
+      } else if (is.call(jsub) && jsub[[1L]] == "get") {
         jval = copy(jval) # fix for #1212
       }
-    } else {
       if (is.data.table(jval)) {
         setattr(jval, '.data.table.locked', NULL) # fix for #1341
         if (!truelength(jval)) alloc.col(jval)
       }
     }
+
     if (!is.null(lhs)) {
       # TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
       .Call(Cassign,x,irows,cols,newnames,jval)
diff --git a/R/setkey.R b/R/setkey.R
index fbf8accbe..0037a9ddb 100644
--- a/R/setkey.R
+++ b/R/setkey.R
@@ -102,12 +102,15 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
   }
   setattr(x,"index",NULL)   # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear.
   if (length(o)) {
-    if (verbose) {
-      tt = suppressMessages(system.time(.Call(Creorder,x,o)))
-      cat("reorder took", tt["user.self"]+tt["sys.self"], "sec\n")
-    } else {
-      .Call(Creorder,x,o)
+    # table has deep-duplicate columns, e.g. #3496
+    if (any(idx <- duplicated(vapply_1c(x, address)))) {
+      if (verbose) { last.started.at=proc.time(); cat('Deep duplicate columns (identical address in memory) detected... copying', sum(idx), 'columns ...'); flush.console() }
+      for (j in which(idx)) set(x, NULL, j, copy(x[[j]]))
+      if (verbose) { cat("done in", timetaken(last.started.at), "\n"); flush.console() }
     }
+    if (verbose) { last.started.at = proc.time() }
+    .Call(Creorder,x,o)
+    if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() }
   } else {
     if (verbose) cat("x is already ordered by these columns, no need to call reorder\n")
   } # else empty integer() from forderv means x is already ordered by those cols, nothing to do.
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
index 58c3c63c4..503bdc8a7 100644
--- a/inst/tests/tests.Rraw
+++ b/inst/tests/tests.Rraw
@@ -15671,6 +15671,16 @@ if (test_bit64) {
   test(2077.06, int64_int32_match(d[, sum(i32, na.rm=TRUE), g], d[, sum(i64, na.rm=TRUE), g]))
 }
 
+# #3766 simple queries can create tables with deep-copy-identical columns
+x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1)
+test(2078.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))])
+# #3496 generally need to protect against deep-copy-idential columns in setkey
+x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
+x$b = x$a
+setDT(x)
+test(2078.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns')
+test(2078.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a'))
+
 
 ###################################
 #  Add new tests above this line  #