Skip to content

Commit

Permalink
Closes #495. Columns not mentioned in .SDcols can be used in j just…
Browse files Browse the repository at this point in the history
… fine.
  • Loading branch information
arunsrinivasan committed Mar 7, 2016
1 parent 39c9e14 commit 68091d8
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 44 deletions.
56 changes: 37 additions & 19 deletions R/data.table.R
Expand Up @@ -723,6 +723,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
byval = NULL
xnrow = nrow(x)
xcols = xcolsAns = icols = icolsAns = integer()
othervars = character(0)
if (missing(j)) {
# missing(by)==TRUE was already checked above before dealing with i
if (!length(x)) return(null.data.table())
Expand Down Expand Up @@ -1009,6 +1010,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
# dups = FALSE here. DT[, .SD, .SDcols=c("x", "x")] again doesn't really help with which 'x' to keep (and if '-' which x to remove)
ansvals = chmatch(ansvars, names(x))
}
# fix for long standing FR/bug, #495
if ( length(othervars <- intersect(av, names(x))) ) {
# we've a situation like DT[, c(sum(V1), lapply(.SD, mean)), by=., .SDcols=...] or
# DT[, lapply(.SD, function(x) x *v1), by=, .SDcols=...] etc.,
ansvars = union(ansvars, othervars)
ansvals = chmatch(ansvars, names(x))
}
# .SDcols might include grouping columns if users wants that, but normally we expect user not to include them in .SDcols
}
} else {
Expand Down Expand Up @@ -1235,11 +1243,12 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {

if (!with || missing(j)) return(alloc.col(ans))

SDenv$.SD = ans
SDenv$.SDall = ans
SDenv$.SD = if (!length(othervars)) SDenv$.SDall else shallow(SDenv$.SDall, setdiff(ansvars, othervars))
SDenv$.N = nrow(SDenv$.SD)

} else {
SDenv$.SD = null.data.table() # no columns used by j so .SD can be empty. Only needs to exist so that we can rely on it being there when locking it below for example. If .SD were used by j, of course then xvars would be the columns and we wouldn't be in this leaf.
SDenv$.SDall = SDenv$.SD = null.data.table() # no columns used by j so .SD can be empty. Only needs to exist so that we can rely on it being there when locking it below for example. If .SD were used by j, of course then xvars would be the columns and we wouldn't be in this leaf.
SDenv$.N = if (is.null(irows)) nrow(x) else length(irows) * !identical(suppressWarnings(max(irows)), 0L)
# Fix for #963.
# When irows is integer(0), length(irows) = 0 will result in 0 (as expected).
Expand All @@ -1249,11 +1258,13 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
SDenv$.I = if (!missing(j) && ".I" %chin% av) seq_len(SDenv$.N) else 0L
SDenv$.GRP = 1L
setattr(SDenv$.SD,".data.table.locked",TRUE) # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=.
setattr(SDenv$.SDall,".data.table.locked",TRUE)
lockBinding(".SD",SDenv)
lockBinding(".SDall",SDenv)
lockBinding(".N",SDenv)
lockBinding(".I",SDenv)
lockBinding(".GRP",SDenv)
for (ii in ansvars) assign(ii, SDenv$.SD[[ii]], SDenv)
for (ii in ansvars) assign(ii, SDenv$.SDall[[ii]], SDenv)
# Since .SD is inside SDenv, alongside its columns as variables, R finds .SD symbol more quickly, if used.
# There isn't a copy of the columns here, the xvar symbols point to the SD columns (copy-on-write).

Expand Down Expand Up @@ -1335,7 +1346,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
assign("print", function(x,...){base::print(x,...);NULL}, SDenv)
# Now ggplot2 returns data from print, we need a way to throw it away otherwise j accumulates the result

SDenv$.SD = null.data.table() # e.g. test 607. Grouping still proceeds even though no .SD e.g. grouping key only tables, or where j consists of .N only
SDenv$.SDall = SDenv$.SD = null.data.table() # e.g. test 607. Grouping still proceeds even though no .SD e.g. grouping key only tables, or where j consists of .N only
SDenv$.N = as.integer(0) # not 0L for the reson on next line :
SDenv$.GRP = as.integer(1) # oddly using 1L doesn't work reliably here! Possible R bug? TO DO: create reproducible example and report. To reproduce change to 1L and run test.data.table, test 780 fails. The assign seems ineffective and a previous value for .GRP from a previous test is retained, despite just creating a new SDenv.

Expand Down Expand Up @@ -1396,13 +1407,19 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
alloc = if (length(len__)) seq_len(max(len__)) else 0L
SDenv$.I = alloc
if (length(xcols)) {
SDenv$.SD = .Call(CsubsetDT,x,alloc,xcols) # i.e. x[alloc, xcols, with=FALSE] but without recursive overhead
SDenv$.SDall = .Call(CsubsetDT,x,alloc,xcols) # i.e. x[alloc, xcols, with=FALSE] but without recursive overhead
SDenv$.SD = if (!length(othervars)) SDenv$.SDall else shallow(SDenv$.SDall, setdiff(ansvars, othervars))
# Must not shallow copy here. This is the allocation for the largest group. Since i=alloc is passed in here, it won't shallow copy, even in future. Only DT[,xvars,with=FALSE] might ever shallow copy automatically.
}
if (nrow(SDenv$.SD)==0L) setattr(SDenv$.SD,"row.names",c(NA_integer_,0L))
if (nrow(SDenv$.SDall)==0L) {
setattr(SDenv$.SDall,"row.names",c(NA_integer_,0L))
setattr(SDenv$.SD,"row.names",c(NA_integer_,0L))
}
# .set_row_names() basically other than not integer() for 0 length, otherwise dogroups has no [1] to modify to -.N
setattr(SDenv$.SD,".data.table.locked",TRUE) # used to stop := modifying .SD via j=f(.SD), bug#1727. The more common case of j=.SD[,subcol:=1] was already caught when jsub is inspected for :=.
setattr(SDenv$.SDall,".data.table.locked",TRUE)
lockBinding(".SD",SDenv)
lockBinding(".SDall",SDenv)
lockBinding(".N",SDenv)
lockBinding(".GRP",SDenv)
lockBinding(".I",SDenv)
Expand All @@ -1411,6 +1428,7 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
GForce = FALSE
if ( (getOption("datatable.optimize")>=1 && is.call(jsub)) || (is.name(jsub) && as.character(jsub) %chin% c(".SD",".N")) ) { # Ability to turn off if problems or to benchmark the benefit
# Optimization to reduce overhead of calling lapply over and over for each group
ansvarsnew = setdiff(ansvars, othervars)
oldjsub = jsub
funi = 1L # Fix for #985
# convereted the lapply(.SD, ...) to a function and used below, easier to implement FR #2722 then.
Expand All @@ -1431,14 +1449,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
if (is.character(fun)) fun = as.name(fun)
txt[[1L]] = fun
}
ans = vector("list",length(ansvars)+1L)
ans = vector("list",length(ansvarsnew)+1L)
ans[[1L]] = as.name("list")
for (ii in seq_along(ansvars)) {
txt[[2L]] = as.name(ansvars[ii])
for (ii in seq_along(ansvarsnew)) {
txt[[2L]] = as.name(ansvarsnew[ii])
ans[[ii+1L]] = as.call(txt)
}
jsub = as.call(ans) # important no names here
jvnames = ansvars # but here instead
jvnames = ansvarsnew # but here instead
list(jsub, jvnames)
# It may seem inefficient to constuct a potentially long expression. But, consider calling
# lapply 100000 times. The C code inside lapply does the LCONS stuff anyway, every time it
Expand All @@ -1454,14 +1472,14 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
}
if (is.name(jsub)) {
if (jsub == ".SD") {
jsub = as.call(c(quote(list), lapply(ansvars, as.name)))
jvnames = ansvars
jsub = as.call(c(quote(list), lapply(ansvarsnew, as.name)))
jvnames = ansvarsnew
}
} else {
if ( length(jsub) == 3L && (jsub[[1L]] == "[" || jsub[[1L]] == "head" || jsub[[1L]] == "tail") && jsub[[2L]] == ".SD" && (is.numeric(jsub[[3L]]) || jsub[[3L]] == ".N") ) {
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
jsub = as.call(c(quote(list), lapply(ansvars, function(x) { jsub[[2L]] = as.name(x); jsub })))
jvnames = ansvars
jsub = as.call(c(quote(list), lapply(ansvarsnew, function(x) { jsub[[2L]] = as.name(x); jsub })))
jvnames = ansvarsnew
} else if (jsub[[1L]]=="lapply" && jsub[[2L]]==".SD" && length(xcols)) {
deparse_ans = .massageSD(jsub)
jsub = deparse_ans[[1L]]
Expand All @@ -1481,15 +1499,15 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
any_SD = FALSE
jsubl = as.list.default(jsub)
oldjvnames = jvnames
jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*lenght(ansvars) + other jvars ?? not straightforward.
jvnames = NULL # TODO: not let jvnames grow, maybe use (number of lapply(.SD, .))*lenght(ansvarsnew) + other jvars ?? not straightforward.
# Fix for #744. Don't use 'i' in for-loops. It masks the 'i' from the input!!
for (i_ in 2:length(jsubl)) {
this = jsub[[i_]]
if (is.name(this)) {
if (this == ".SD") { # optimise '.SD' alone
any_SD = TRUE
jsubl[[i_]] = lapply(ansvars, as.name)
jvnames = c(jvnames, ansvars)
jsubl[[i_]] = lapply(ansvarsnew, as.name)
jvnames = c(jvnames, ansvarsnew)
} else if (this == ".N") {
# don't optimise .I in c(.SD, .I), it's length can be > 1
# only c(.SD, list(.I)) should be optimised!! .N is always length 1.
Expand Down Expand Up @@ -1522,8 +1540,8 @@ chmatch2 <- function(x, table, nomatch=NA_integer_) {
this[[2L]] == ".SD" && (is.numeric(this[[3L]]) || this[[3L]] == ".N") ) {
# optimise .SD[1] or .SD[2L]. Not sure how to test .SD[a] as to whether a is numeric/integer or a data.table, yet.
any_SD = TRUE
jsubl[[i_]] = lapply(ansvars, function(x) { this[[2L]] = as.name(x); this })
jvnames = c(jvnames, ansvars)
jsubl[[i_]] = lapply(ansvarsnew, function(x) { this[[2L]] = as.name(x); this })
jvnames = c(jvnames, ansvarsnew)
} else if (any(all.vars(this) == ".SD")) {
# TODO, TO DO: revisit complex cases (as illustrated below)
# complex cases like DT[, c(.SD[x>1], .SD[J(.)], c(.SD), a + .SD, lapply(.SD, sum)), by=grp]
Expand Down
2 changes: 2 additions & 0 deletions README.md
Expand Up @@ -64,6 +64,8 @@

24. `uniqueN` gains `na.rm` argument, [#1455](https://github.com/Rdatatable/data.table/issues/1455).

25. Finally, we can now refer to the columns that are not mentioned in `.SD/.SDcols` in `j` as well. For example, `DT[, .(sum(v1), lapply(.SD, mean)), by=grp, .SDcols=v2:V3]` works fine, [#495](https://github.com/Rdatatable/data.table/issues/495).

#### BUG FIXES

1. Now compiles and runs on IBM AIX gcc. Thanks to Vinh Nguyen for investigation and testing, [#1351](https://github.com/Rdatatable/data.table/issues/1351).
Expand Down
36 changes: 36 additions & 0 deletions inst/tests/tests.Rraw
Expand Up @@ -7807,6 +7807,42 @@ test(1628.2, uniqueN(dt, na.rm=TRUE), nrow(na.omit(dt[, .N, by=.(x,y,z)])))
test(1628.3, dt[, uniqueN(y, na.rm=TRUE), by=z], dt[, length(unique(na.omit(y))), by=z])
test(1628.4, dt[, uniqueN(.SD, na.rm=TRUE), by=z], dt[, nrow(na.omit(.SD[, .N, by=.(x,y)])), by=z])

# fix for long standing FR/bug, #495
# most likely I'm missing some tests, but we'll fix/add them as we go along.
dt = data.table(grp=c(2,3,3,1,1,2,3), v1=1:7, v2=7:1, v3=10:16)
test(1629.1, dt[, .SD*v1, .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
test(1629.2, dt[, lapply(.SD, function(x) x*v1), .SDcols=v2:v3], dt[, .(v2=v2*v1, v3=v3*v1)])
test(1629.3, dt[, lapply(.SD, function(x) mean(x)*sum(v1)), .SDcols=v2:v3], data.table(v2=112, v3=364))
test(1629.4, dt[, c(sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(V1=28L, v2=4, v3=13))
test(1629.5, dt[, c(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, v2=4, v3=13))
test(1629.6, dt[, .(v1=sum(v1), lapply(.SD, mean)), .SDcols=v2:v3], data.table(v1=28L, V2=list(4,13)))
test(1629.7, dt[0][, .SD*v1, .SDcols=v2:v3], dt[0][, .SD, .SDcols=v2:v3])
# add/update
dt2 = copy(dt)
test(1629.8, dt2[, c("v2", "v3") := .SD*v1, .SDcols=v2:v3], dt[, .(grp, v1, v2=v2*v1, v3=v3*v1)])
# grouping operations
oldopts = getOption("datatable.optimize") # backup
options(datatable.optimize = 1L) # no gforce
test(1629.9, dt[, .SD*sum(v1), by=grp, .SDcols=v2:v3], dt[, .SD*sum(v1), by=grp][, v1 := NULL])
ans1 = dt[, sum(v1), by=grp]
ans2 = dt[, base::max(.SD), by=grp, .SDcols=v2:v3]
test(1629.10, dt[, max(.SD)*sum(v1), by=grp, .SDcols=v2:v3], ans1[, .(grp, V1=V1*ans2$V1)])
test(1629.11, dt[, lapply(.SD, function(x) weighted.mean(x, w=v2)), .SDcols=c("v1","v3"), by=grp],
dt[, .(v1=weighted.mean(v1,w=v2), v3=weighted.mean(v3, w=v2)), by=grp])
test(1629.12, dt[, c(v1=max(v1), lapply(.SD, base::min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
# gforce
options(datatable.optimize = Inf) # Inf
test(1629.13, dt[, c(v1=max(v1), lapply(.SD, min)), by=grp, .SDcols=v2:v3], dt[, .(v1=max(v1), v2=min(v2), v3=min(v3)), by=grp])
# even more complex, shouldn't run any optimisation
dt[, v4 := v1/2]
test(1629.14, dt[, c(.(v1=v1*min(v4)), lapply(.SD, function(x) x*max(v4))), by=grp, .SDcols=v2:v3],
dt[, .(v1=v1*min(v4), v2=v2*max(v4), v3=v3*max(v4)), by=grp])
test(1629.15, copy(dt)[, c("a", "b", "c") := c(min(v1), lapply(.SD, function(x) max(x)*min(v1))), by=grp, .SDcols=v3:v4], copy(dt)[, c("a", "b", "c") := .(min(v1), max(v3)*min(v1), max(v4)*min(v1)), by=grp])
options(datatable.optimize = oldopts)
# by=.EACHI and operations with 'i'
test(1629.16, dt[.(2:3), c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=.EACHI, .SDcols=v2:v3, on="grp"], dt[grp %in% 2:3, c(.(sum(v1)), lapply(.SD, function(x) mean(x)*min(v1))), by=grp, .SDcols=v2:v3])
test(1629.17, dt[.(2:3), c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3, on="grp"][order(V1,v2,v3)], dt[grp %in% 2:3, c(sum(v1), lapply(.SD, function(x) mean(x)*v1)), .SDcols=v2:v3][order(V1,v2,v3)])

##########################

# TODO: Tests involving GForce functions needs to be run with optimisation level 1 and 2, so that both functions are tested all the time.
Expand Down

0 comments on commit 68091d8

Please sign in to comment.