Closes #1596 -- CJ now auto-names its input

Rdatatable · Jul 17, 2018 · 4ed2f2d · 4ed2f2d
1 parent c72a977
commit 4ed2f2d
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 29 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -5,6 +5,8 @@
 
 #### NEW FEATURES
 
+1. `CJ()` now auto-names its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Code that relies on `V1`, `V2`, ... naming will no longer work as expected. Thanks @franknarf1 for the suggestion.
+
 #### BUG FIXES
 
 #### NOTES

diff --git a/R/data.table.R b/R/data.table.R
@@ -49,31 +49,20 @@ data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
   .Call(CcopyNamedInList,x)   # to maintain pre-Rv3.1.0 behaviour, for now. See test 548.2. TODO: revist
   # TODO Something strange with NAMED on components of `...` to investigate. Or, just port data.table() to C.
 
-  # fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top.
-  if (identical(x, list(NULL)) || identical(x, list(list())) ||
-      identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() )
-  tt <- as.list(substitute(list(...)))[-1L]  # Intention here is that data.table(X,Y) will automatically put X and Y as the column names.  For longer expressions, name the arguments to data.table(). But in a call to [.data.table, wrap in list() e.g. DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names
-  vnames = names(tt)
-  if (is.null(vnames)) vnames = rep.int("",length(x))
-  vnames[is.na(vnames)] = ""
-  novname = vnames==""
-  if (any(!novname)) {
-    if (any(vnames[!novname] == ".SD")) stop("A column may not be called .SD. That has special meaning.")
-  }
-  for (i in which(novname)) {
-    # if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.))
-    if (is.null(ncol(x[[i]]))) {
-      if ((tmp <- deparse(tt[[i]])[1L]) == make.names(tmp))
-        vnames[i] <- tmp
-    }
-  }
-  tt = vnames==""
-  if (any(tt)) vnames[tt] = paste0("V", which(tt))
-  # so now finally we have good column names. We also will use novname later to know which were explicitly supplied in the call.
   n <- length(x)
   if (n < 1L)
     return( null.data.table() )
+  # fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top.
+  if (identical(x, list(NULL)) || identical(x, list(list())) ||
+      identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() )
+  nd = name_dots(...)
+  vnames = nd$vnames
+  # We will use novname later to know which were explicitly supplied in the call.
+  novname = nd$novname
   if (length(vnames) != n) stop("logical error in vnames")
+  # cast to a list to facilitate naming of columns with dimension --
+  #   unlist() at the end automatically handles the need to "push" names
+  #   to accommodate the "new" columns
   vnames <- as.list.default(vnames)
   nrows = integer(n)          # vector of lengths of each column. may not be equal if silent repetition is required.
   numcols = integer(n)         # the ncols of each of the inputs (e.g. if inputs contain matrix or data.table)

diff --git a/R/setkey.R b/R/setkey.R
@@ -390,13 +390,8 @@ CJ <- function(..., sorted = TRUE, unique = FALSE)
   }
   setattr(l, "row.names", .set_row_names(length(l[[1L]])))
   setattr(l, "class", c("data.table", "data.frame"))
+  setattr(l, "names", name_dots(...)$vnames)
 
-  if (is.null(vnames <- names(l)))
-    vnames = vector("character", length(l))
-  if (any(tt <- vnames == "")) {
-    vnames[tt] = paste0("V", which(tt))
-    setattr(l, "names", vnames)
-  }
   l <- alloc.col(l)  # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation.
   if (sorted) {
     if (!dups) setattr(l, 'sorted', names(l))

diff --git a/R/utils.R b/R/utils.R
@@ -71,3 +71,32 @@ vapply_1i <- function (x, fun, ..., use.names = TRUE) {
 
 more = function(f) system(paste("more",f))    # nocov  (just a dev helper)
 
+# helper used to auto-name columns in data.table(x, y) as x, y (and similar)
+#   from the original comment in data.table.R:
+#     Intention here is that data.table(X,Y) will automatically put X and Y
+#     as the column names.  For longer expressions, name the arguments to
+#     data.table(). But in a call to [.data.table, wrap in list() e.g.
+#     DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names
+#   naming of unnested matrices still handled by data.table()
+name_dots <- function(...) {
+  dot_sub <- as.list(substitute(list(...)))[-1L]
+  vnames = names(dot_sub)
+  l = list(...)
+  if (is.null(vnames)) vnames = rep.int("", length(l))
+  vnames[is.na(vnames)] = ""
+  novname = vnames==""
+  if (any(idx <- !novname)) {
+    if (any(vnames[idx] == ".SD")) stop("A column may not be called .SD. That has special meaning.")
+  }
+  for (i in which(novname)) {
+    # if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.))
+    if (is.null(ncol(l[[i]]))) {
+      if ((tmp <- deparse(dot_sub[[i]])[1L]) == make.names(tmp))
+        vnames[i] <- tmp
+    }
+  }
+  still_empty = vnames==""
+  if (any(still_empty)) vnames[still_empty] = paste0("V", which(still_empty))
+  list(vnames = vnames,
+       novname = novname)
+}
diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -2709,7 +2709,7 @@ test(995, DT[CJ(c(5,3), c(5,1), sorted=FALSE)], OUT)
 # CJ with ordered factor
 xx <- factor(letters[1:2], ordered=TRUE)
 yy <- sample(2)
-test(996, CJ(xx, yy), setkey(data.table(rep(xx, each=2), rep(base::sort.int(yy), 2))))
+test(996, CJ(xx, yy), setkey(data.table(xx = rep(xx, each=2), yy = rep(base::sort.int(yy), 2))))
 
 # That CJ orders NA consistently with setkey and historically, now it doesn't use setkey.
 # NA must always come first in data.table throughout, since binary search relies on that internally.
@@ -6799,7 +6799,7 @@ test(1524, ans1, ans2)
 # 'unique =' argument for CJ, #1148
 x = c(1, 2, 1)
 y = c(5, 8, 8, 4)
-test(1525, CJ(x, y, unique=TRUE), CJ(c(1,2), c(4,5,8)))
+test(1525, CJ(x, y, unique=TRUE), CJ(x = c(1,2), y = c(4,5,8)))
 
 # `key` argument fix for `setDT` when input is already a `data.table`,  #1169
 DT <- data.table(A = 1:4, B = 5:8)