Closes #495. Columns not mentioned in .SDcols can be used in j just…

… fine.
Rdatatable · Mar 7, 2016 · 68091d8 · 68091d8
1 parent 39c9e14
commit 68091d8
Show file tree

Hide file tree

Showing 4 changed files with 102 additions and 44 deletions.
diff --git a/R/data.table.R b/R/data.table.R
@@ -723,6 +723,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     byval = NULL
     xnrow = nrow(x)
     xcols = xcolsAns = icols = icolsAns = integer()
+    othervars = character(0)
     if (missing(j)) {
         # missing(by)==TRUE was already checked above before dealing with i
         if (!length(x)) return(null.data.table())
@@ -1009,6 +1010,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                         # dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
                         ansvals = chmatch(ansvars, names(x))
                     }
+                    # fix for long standing FR/bug, #495
+                    if ( length(othervars <- intersect(av, names(x))) ) {
+                        # we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or 
+                        # DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc., 
+                        ansvars = union(ansvars, othervars)
+                        ansvals = chmatch(ansvars, names(x))
+                    }
                     # .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols
                 }
             } else {
@@ -1235,11 +1243,12 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
 
             if (!with || missing(j)) return(alloc.col(ans))
 
-            SDenv$.SD = ans
+            SDenv$.SDall = ans
+            SDenv$.SD = if (!length(othervars)) SDenv$.SDall else shallow(SDenv$.SDall, setdiff(ansvars, othervars))
             SDenv$.N = nrow(SDenv$.SD)
 
         } else {
-            SDenv$.SD = null.data.table()   # no columns used by j so .SD can be empty. Only needs to exist so that we can rely on it being there when locking it below for example. If .SD were used by j, of course then xvars would be the columns and we wouldn't be in this leaf.
+            SDenv$.SDall = SDenv$.SD = null.data.table()   # no columns used by j so .SD can be empty. Only needs to exist so that we can rely on it being there when locking it below for example. If .SD were used by j, of course then xvars would be the columns and we wouldn't be in this leaf.
             SDenv$.N = if (is.null(irows)) nrow(x) else length(irows) * !identical(suppressWarnings(max(irows)), 0L)
             # Fix for #963.
             # When irows is integer(0), length(irows) = 0 will result in 0 (as expected).
@@ -1249,11 +1258,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
         SDenv$.I = if (!missing(j) && ".I" %chin% av) seq_len(SDenv$.N) else 0L
         SDenv$.GRP = 1L
         setattr(SDenv$.SD,".data.table.locked",TRUE)   # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=.
+        setattr(SDenv$.SDall,".data.table.locked",TRUE)
         lockBinding(".SD",SDenv)
+        lockBinding(".SDall",SDenv)
         lockBinding(".N",SDenv)
         lockBinding(".I",SDenv)
         lockBinding(".GRP",SDenv)
-        for (ii in ansvars) assign(ii, SDenv$.SD[[ii]], SDenv)
+        for (ii in ansvars) assign(ii, SDenv$.SDall[[ii]], SDenv)
         # Since .SD is inside SDenv, alongside its columns as variables, R finds .SD symbol more quickly, if used.
         # There isn't a copy of the columns here, the xvar symbols point to the SD columns (copy-on-write).
 
@@ -1335,7 +1346,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     assign("print", function(x,...){base::print(x,...);NULL}, SDenv)
     # Now ggplot2 returns data from print, we need a way to throw it away otherwise j accumulates the result
 
-    SDenv$.SD = null.data.table()  # e.g. test 607. Grouping still proceeds even though no .SD e.g. grouping key only tables, or where j consists of .N only
+    SDenv$.SDall = SDenv$.SD = null.data.table()  # e.g. test 607. Grouping still proceeds even though no .SD e.g. grouping key only tables, or where j consists of .N only
     SDenv$.N = as.integer(0)     # not 0L for the reson on next line :
     SDenv$.GRP = as.integer(1)   # oddly using 1L doesn't work reliably here! Possible R bug? TO DO: create reproducible example and report. To reproduce change to 1L and run test.data.table, test 780 fails. The assign seems ineffective and a previous value for .GRP from a previous test is retained, despite just creating a new SDenv.
 
@@ -1396,13 +1407,19 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     alloc = if (length(len__)) seq_len(max(len__)) else 0L
     SDenv$.I = alloc
     if (length(xcols)) {
-        SDenv$.SD = .Call(CsubsetDT,x,alloc,xcols)    # i.e. x[alloc, xcols, with=FALSE] but without recursive overhead
+        SDenv$.SDall = .Call(CsubsetDT,x,alloc,xcols)    # i.e. x[alloc, xcols, with=FALSE] but without recursive overhead
+        SDenv$.SD = if (!length(othervars)) SDenv$.SDall else shallow(SDenv$.SDall, setdiff(ansvars, othervars))
         # Must not shallow copy here. This is the allocation for the largest group. Since i=alloc is passed in here, it won't shallow copy, even in future. Only DT[,xvars,with=FALSE] might ever shallow copy automatically.
     }
-    if (nrow(SDenv$.SD)==0L) setattr(SDenv$.SD,"row.names",c(NA_integer_,0L))
+    if (nrow(SDenv$.SDall)==0L) {
+        setattr(SDenv$.SDall,"row.names",c(NA_integer_,0L))
+        setattr(SDenv$.SD,"row.names",c(NA_integer_,0L))
+    }
     # .set_row_names() basically other than not integer() for 0 length, otherwise dogroups has no [1] to modify to -.N
     setattr(SDenv$.SD,".data.table.locked",TRUE)   # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=.
+    setattr(SDenv$.SDall,".data.table.locked",TRUE)
     lockBinding(".SD",SDenv)
+    lockBinding(".SDall",SDenv)
     lockBinding(".N",SDenv)
     lockBinding(".GRP",SDenv)
     lockBinding(".I",SDenv)
@@ -1411,6 +1428,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
     GForce = FALSE
     if ( (getOption("datatable.optimize")>=1 && is.call(jsub)) || (is.name(jsub) && as.character(jsub) %chin% c(".SD",".N")) ) {  # Ability to turn off if problems or to benchmark the benefit
         # Optimization to reduce overhead of calling lapply over and over for each group
+        ansvarsnew = setdiff(ansvars, othervars)
         oldjsub = jsub
         funi = 1L # Fix for #985
         # convereted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then.
@@ -1431,14 +1449,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                 if (is.character(fun)) fun = as.name(fun)
                 txt[[1L]] = fun
             }
-            ans = vector("list",length(ansvars)+1L)
+            ans = vector("list",length(ansvarsnew)+1L)
             ans[[1L]] = as.name("list")
-            for (ii in seq_along(ansvars)) {
-                txt[[2L]] = as.name(ansvars[ii])
+            for (ii in seq_along(ansvarsnew)) {
+                txt[[2L]] = as.name(ansvarsnew[ii])
                 ans[[ii+1L]] = as.call(txt)
             }
             jsub = as.call(ans)  # important no names here
-            jvnames = ansvars      # but here instead
+            jvnames = ansvarsnew      # but here instead
             list(jsub, jvnames)
             # It may seem inefficient to constuct a potentially long expression. But, consider calling
             # lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it
@@ -1454,14 +1472,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
         }
         if (is.name(jsub)) {
             if (jsub == ".SD") {
-                jsub = as.call(c(quote(list), lapply(ansvars, as.name)))
-                jvnames = ansvars
+                jsub = as.call(c(quote(list), lapply(ansvarsnew, as.name)))
+                jvnames = ansvarsnew
             }
         } else {
             if ( length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "head" || jsub[[1L]] == "tail") && jsub[[2L]] == ".SD" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") ) {
                 # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
-                jsub = as.call(c(quote(list), lapply(ansvars, function(x) { jsub[[2L]] = as.name(x); jsub })))
-                jvnames = ansvars
+                jsub = as.call(c(quote(list), lapply(ansvarsnew, function(x) { jsub[[2L]] = as.name(x); jsub })))
+                jvnames = ansvarsnew
             } else if (jsub[[1L]]=="lapply" && jsub[[2L]]==".SD" && length(xcols)) {
                 deparse_ans = .massageSD(jsub)
                 jsub = deparse_ans[[1L]]
@@ -1481,15 +1499,15 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                 any_SD = FALSE
                 jsubl = as.list.default(jsub)
                 oldjvnames = jvnames
-                jvnames = NULL           # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*lenght(ansvars) + other jvars ?? not straightforward.
+                jvnames = NULL           # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*lenght(ansvarsnew) + other jvars ?? not straightforward.
                 # Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!!
                 for (i_ in 2:length(jsubl)) {
                     this = jsub[[i_]]
                     if (is.name(this)) {
                         if (this == ".SD") { # optimise '.SD' alone
                             any_SD = TRUE
-                            jsubl[[i_]] = lapply(ansvars, as.name)
-                            jvnames = c(jvnames, ansvars)
+                            jsubl[[i_]] = lapply(ansvarsnew, as.name)
+                            jvnames = c(jvnames, ansvarsnew)
                         } else if (this == ".N") {
                             # don't optimise .I in c(.SD, .I), it's length can be > 1 
                             # only c(.SD, list(.I)) should be optimised!! .N is always length 1.
@@ -1522,8 +1540,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
                                         this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N") ) {
                             # optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
                             any_SD = TRUE
-                            jsubl[[i_]] = lapply(ansvars, function(x) { this[[2L]] = as.name(x); this })
-                            jvnames = c(jvnames, ansvars)
+                            jsubl[[i_]] = lapply(ansvarsnew, function(x) { this[[2L]] = as.name(x); this })
+                            jvnames = c(jvnames, ansvarsnew)
                         } else if (any(all.vars(this) == ".SD")) {
                             # TODO, TO DO: revisit complex cases (as illustrated below)
                             # complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp]

diff --git a/README.md b/README.md
@@ -64,6 +64,8 @@
 
   24. `uniqueN` gains `na.rm` argument, [#1455](https://github.com/Rdatatable/data.table/issues/1455).
 
+  25. Finally, we can now refer to the columns that are not mentioned in `.SD/.SDcols` in `j` as well. For example, `DT[, .(sum(v1), lapply(.SD, mean)), by=grp, .SDcols=v2:V3]` works fine, [#495](https://github.com/Rdatatable/data.table/issues/495).
+
 #### BUG FIXES
 
   1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -7807,6 +7807,42 @@ test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y,z)])))
 test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(na.omit(y))), by=z])
 test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(na.omit(.SD[, .N, by=.(x,y)])), by=z])
 
+# fix for long standing FR/bug, #495
+# most likely I'm missing some tests, but we'll fix/add them as we go along.
+dt = data.table(grp=c(2,3,3,1,1,2,3), v1=1:7, v2=7:1, v3=10:16)
+test(1629.1, dt[, .SD*v1, .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
+test(1629.2, dt[, lapply(.SD, function(x) x*v1), .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
+test(1629.3, dt[, lapply(.SD, function(x) mean(x)*sum(v1)), .SDcols=v2:v3], data.table(v2=112, v3=364))
+test(1629.4, dt[, c(sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(V1=28L, v2=4, v3=13))
+test(1629.5, dt[, c(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, v2=4, v3=13))
+test(1629.6, dt[, .(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, V2=list(4,13)))
+test(1629.7, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3])
+# add/update
+dt2 = copy(dt)
+test(1629.8, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)])
+# grouping operations
+oldopts = getOption("datatable.optimize") # backup
+options(datatable.optimize = 1L) # no gforce
+test(1629.9, dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL])
+ans1 = dt[, sum(v1), by=grp]
+ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3]
+test(1629.10, dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)])
+test(1629.11, dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp], 
+         dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp])
+test(1629.12, dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
+# gforce
+options(datatable.optimize = Inf) # Inf
+test(1629.13, dt[, c(v1=max(v1), lapply(.SD, min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
+# even more complex, shouldn't run any optimisation
+dt[, v4 := v1/2]
+test(1629.14, dt[, c(.(v1=v1*min(v4)), lapply(.SD, function(x) x*max(v4))), by=grp, .SDcols=v2:v3], 
+    dt[, .(v1=v1*min(v4), v2=v2*max(v4), v3=v3*max(v4)), by=grp])
+test(1629.15, copy(dt)[, c("a", "b", "c") := c(min(v1), lapply(.SD, function(x) max(x)*min(v1))), by=grp, .SDcols=v3:v4], copy(dt)[, c("a", "b", "c") := .(min(v1), max(v3)*min(v1), max(v4)*min(v1)), by=grp])
+options(datatable.optimize = oldopts)
+# by=.EACHI and operations with 'i'
+test(1629.16, dt[.(2:3), c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=.EACHI, .SDcols=v2:v3, on="grp"], dt[grp %in% 2:3, c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=grp, .SDcols=v2:v3])
+test(1629.17, dt[.(2:3), c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3, on="grp"][order(V1,v2,v3)], dt[grp %in% 2:3, c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3][order(V1,v2,v3)])
+
 ##########################
 
 # TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time.