Skip to content

Commit

Permalink
Closes #1596 -- CJ now auto-names its input
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Chirico committed Jul 17, 2018
1 parent c72a977 commit 4ed2f2d
Show file tree
Hide file tree
Showing 5 changed files with 44 additions and 29 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -5,6 +5,8 @@

#### NEW FEATURES

1. `CJ()` now auto-names its inputs exactly as `data.table()` does, [#1596](https://github.com/Rdatatable/data.table/issues/1596). Code that relies on `V1`, `V2`, ... naming will no longer work as expected. Thanks @franknarf1 for the suggestion.

#### BUG FIXES

#### NOTES
Expand Down
31 changes: 10 additions & 21 deletions R/data.table.R
Expand Up @@ -49,31 +49,20 @@ data.table <-function(..., keep.rownames=FALSE, check.names=FALSE, key=NULL, str
.Call(CcopyNamedInList,x) # to maintain pre-Rv3.1.0 behaviour, for now. See test 548.2. TODO: revist
# TODO Something strange with NAMED on components of `...` to investigate. Or, just port data.table() to C.

# fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top.
if (identical(x, list(NULL)) || identical(x, list(list())) ||
identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() )
tt <- as.list(substitute(list(...)))[-1L] # Intention here is that data.table(X,Y) will automatically put X and Y as the column names. For longer expressions, name the arguments to data.table(). But in a call to [.data.table, wrap in list() e.g. DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names
vnames = names(tt)
if (is.null(vnames)) vnames = rep.int("",length(x))
vnames[is.na(vnames)] = ""
novname = vnames==""
if (any(!novname)) {
if (any(vnames[!novname] == ".SD")) stop("A column may not be called .SD. That has special meaning.")
}
for (i in which(novname)) {
# if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.))
if (is.null(ncol(x[[i]]))) {
if ((tmp <- deparse(tt[[i]])[1L]) == make.names(tmp))
vnames[i] <- tmp
}
}
tt = vnames==""
if (any(tt)) vnames[tt] = paste0("V", which(tt))
# so now finally we have good column names. We also will use novname later to know which were explicitly supplied in the call.
n <- length(x)
if (n < 1L)
return( null.data.table() )
# fix for #5377 - data.table(null list, data.frame and data.table) should return null data.table. Simple fix: check all scenarios here at the top.
if (identical(x, list(NULL)) || identical(x, list(list())) ||
identical(x, list(data.frame(NULL))) || identical(x, list(data.table(NULL)))) return( null.data.table() )
nd = name_dots(...)
vnames = nd$vnames
# We will use novname later to know which were explicitly supplied in the call.
novname = nd$novname
if (length(vnames) != n) stop("logical error in vnames")
# cast to a list to facilitate naming of columns with dimension --
# unlist() at the end automatically handles the need to "push" names
# to accommodate the "new" columns
vnames <- as.list.default(vnames)
nrows = integer(n) # vector of lengths of each column. may not be equal if silent repetition is required.
numcols = integer(n) # the ncols of each of the inputs (e.g. if inputs contain matrix or data.table)
Expand Down
7 changes: 1 addition & 6 deletions R/setkey.R
Expand Up @@ -390,13 +390,8 @@ CJ <- function(..., sorted = TRUE, unique = FALSE)
}
setattr(l, "row.names", .set_row_names(length(l[[1L]])))
setattr(l, "class", c("data.table", "data.frame"))
setattr(l, "names", name_dots(...)$vnames)

if (is.null(vnames <- names(l)))
vnames = vector("character", length(l))
if (any(tt <- vnames == "")) {
vnames[tt] = paste0("V", which(tt))
setattr(l, "names", vnames)
}
l <- alloc.col(l) # a tiny bit wasteful to over-allocate a fixed join table (column slots only), doing it anyway for consistency, and it's possible a user may wish to use SJ directly outside a join and would expect consistent over-allocation.
if (sorted) {
if (!dups) setattr(l, 'sorted', names(l))
Expand Down
29 changes: 29 additions & 0 deletions R/utils.R
Expand Up @@ -71,3 +71,32 @@ vapply_1i <- function (x, fun, ..., use.names = TRUE) {

more = function(f) system(paste("more",f)) # nocov (just a dev helper)

# helper used to auto-name columns in data.table(x, y) as x, y (and similar)
# from the original comment in data.table.R:
# Intention here is that data.table(X,Y) will automatically put X and Y
# as the column names. For longer expressions, name the arguments to
# data.table(). But in a call to [.data.table, wrap in list() e.g.
# DT[,list(a=mean(v),b=foobarzoo(zang))] will get the col names
# naming of unnested matrices still handled by data.table()
name_dots <- function(...) {
dot_sub <- as.list(substitute(list(...)))[-1L]
vnames = names(dot_sub)
l = list(...)
if (is.null(vnames)) vnames = rep.int("", length(l))
vnames[is.na(vnames)] = ""
novname = vnames==""
if (any(idx <- !novname)) {
if (any(vnames[idx] == ".SD")) stop("A column may not be called .SD. That has special meaning.")
}
for (i in which(novname)) {
# if (ncol(as.data.table(x[[i]])) <= 1) { # cbind call in test 230 fails if I write ncol(as.data.table(eval(tt[[i]], parent.frame()))) <= 1, no idea why... (keep this for later even though all tests pass with ncol(.).. because base uses as.data.frame(.))
if (is.null(ncol(l[[i]]))) {
if ((tmp <- deparse(dot_sub[[i]])[1L]) == make.names(tmp))
vnames[i] <- tmp
}
}
still_empty = vnames==""
if (any(still_empty)) vnames[still_empty] = paste0("V", which(still_empty))
list(vnames = vnames,
novname = novname)
}
4 changes: 2 additions & 2 deletions inst/tests/tests.Rraw
Expand Up @@ -2709,7 +2709,7 @@ test(995, DT[CJ(c(5,3), c(5,1), sorted=FALSE)], OUT)
# CJ with ordered factor
xx <- factor(letters[1:2], ordered=TRUE)
yy <- sample(2)
test(996, CJ(xx, yy), setkey(data.table(rep(xx, each=2), rep(base::sort.int(yy), 2))))
test(996, CJ(xx, yy), setkey(data.table(xx = rep(xx, each=2), yy = rep(base::sort.int(yy), 2))))

# That CJ orders NA consistently with setkey and historically, now it doesn't use setkey.
# NA must always come first in data.table throughout, since binary search relies on that internally.
Expand Down Expand Up @@ -6799,7 +6799,7 @@ test(1524, ans1, ans2)
# 'unique =' argument for CJ, #1148
x = c(1, 2, 1)
y = c(5, 8, 8, 4)
test(1525, CJ(x, y, unique=TRUE), CJ(c(1,2), c(4,5,8)))
test(1525, CJ(x, y, unique=TRUE), CJ(x = c(1,2), y = c(4,5,8)))

# `key` argument fix for `setDT` when input is already a `data.table`, #1169
DT <- data.table(A = 1:4, B = 5:8)
Expand Down

0 comments on commit 4ed2f2d

Please sign in to comment.