rbindlist support fill=TRUE with use.names=FALSE and use it in merge.R ToDo of #678 (#5263)
Rdatatable · Nov 23, 2021 · 4922384 · 4922384
1 parent d8dc315
commit 4922384
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 15 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -206,6 +206,51 @@
     #   v1.9.6 18.5400 19.1800 21.5100 20.6900 23.4200 29.040   100
     #  v1.14.4  0.4826  0.5586  0.6586  0.6329  0.7348  1.318   100
     ```
+
+31. `rbind()` and `rbindlist()` now support `fill=TRUE` with `use.names=FALSE` instead of issuing the warning `use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.`
+
+    ```R
+    DT1
+    #        A     B
+    #    <int> <int>
+    # 1:     1     5
+    # 2:     2     6
+
+    DT2
+    #      foo
+    #    <int>
+    # 1:     3
+    # 2:     4
+
+    rbind(DT1, DT2, fill=TRUE)   # no change
+    #        A     B   foo
+    #    <int> <int> <int>
+    # 1:     1     5    NA
+    # 2:     2     6    NA
+    # 3:    NA    NA     3
+    # 4:    NA    NA     4
+
+    rbind(DT1, DT2, fill=TRUE, use.names=FALSE)
+
+    # was:
+    #        A     B   foo
+    #    <int> <int> <int>
+    # 1:     1     5    NA
+    # 2:     2     6    NA
+    # 3:    NA    NA     3
+    # 4:    NA    NA     4
+    # Warning message:
+    # In rbindlist(l, use.names, fill, idcol) :
+    #   use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.
+
+    # now:
+    #        A     B
+    #    <int> <int>
+    # 1:     1     5
+    # 2:     2     6
+    # 3:     3    NA
+    # 4:     4    NA
+    ```
 
 ## BUG FIXES
 

diff --git a/R/merge.R b/R/merge.R
@@ -78,16 +78,7 @@ merge.data.table = function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FAL
     # Perhaps not very commonly used, so not a huge deal that the join is redone here.
     missingyidx = y[!x, which=TRUE, on=by, allow.cartesian=allow.cartesian]
     if (length(missingyidx)) {
-      yy = y[missingyidx]
-      othercolsx = setdiff(nm_x, by)
-      if (length(othercolsx)) {
-        tmp = rep.int(NA_integer_, length(missingyidx))
-        # TO DO: use set() here instead..
-        yy = cbind(yy, x[tmp, othercolsx, with = FALSE])
-      }
-      # empty data.tables (nrow =0, ncol>0) doesn't skip names anymore in new rbindlist
-      # takes care of #24 without having to save names. This is how it should be, IMHO.
-      dt = rbind(dt, yy, use.names=FALSE)
+      dt = rbind(dt, y[missingyidx], use.names=FALSE, fill=TRUE)
     }
   }
   # X[Y] syntax puts JIS i columns at the end, merge likes them alongside i.

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -1863,6 +1863,8 @@ test(628.2, rbind(data.table(a=1:3,b=factor(letters[1:3]),c=factor("foo")), list
 # Test merge with common names and all.y=TRUE, #2011
 DT1 = data.table(a=c(1,3,4,5), total=c(2,1,3,1), key="a")
 DT2 = data.table(a=c(2,3,5), total=c(5,1,2), key="a")
+DT3 = data.table(a=c(2), total=c(5), key="a")
+DT4 = data.table(a=c(3), total=c(1), key="a")
 # 629+630 worked before anyway.  631+632 test the bug fix.
 adf=as.data.frame
 adt=as.data.table
@@ -1875,6 +1877,16 @@ test(630.1, merge(DT1,DT2,all.x=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a"
 
 test(631, merge(DT1,DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=c(NA,1,1),total.y=c(5,1,2),key="a"))
 test(631.1, merge(DT1,DT2,all.y=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all.y=TRUE)),a))
+# ensure merge(x,y,all.y) does not alter input y ...
+# .. i subset y with 1:nrow(y)
+test(631.2, merge(DT1[c(1,3)],DT2,all.y=TRUE), data.table(a=c(2,3,5),total.x=NA_real_,total.y=c(5,1,2),key="a"))
+test(631.3, DT2, data.table(a=c(2,3,5), total=c(5,1,2), key="a"))
+# .. nrow(y)=1, i subset y with 1 and no match with x
+test(631.4, merge(DT1,DT3,all.y=TRUE), data.table(a=c(2),total.x=NA_real_,total.y=c(5),key="a"))
+test(631.5, DT3, data.table(a=c(2), total=c(5), key="a"))
+# .. nrow(y)=1, i subset y with 1 and match with x
+test(631.6, merge(DT1,DT4,all.y=TRUE), data.table(a=c(3),total.x=c(1),total.y=c(1),key="a"))
+test(631.7, DT4, data.table(a=c(3), total=c(1), key="a"))
 
 test(632, merge(DT1,DT2,all=TRUE), data.table(a=c(1,2,3,4,5),total.x=c(2,NA,1,3,1),total.y=c(NA,5,1,NA,2),key="a"))
 test(632.1, merge(DT1,DT2,all=TRUE), setkey(adt(merge(adf(DT1),adf(DT2),by="a",all=TRUE)),a))
@@ -14577,8 +14589,11 @@ test(2002.12, rbind(DT1, DT2, idcol='id'),     data.table(id=integer(), a=logica
 test(2003.1, rbindlist(list(), use.names=1), error="use.names= should be TRUE, FALSE, or not used [(]\"check\" by default[)]")
 test(2003.2, rbindlist(list(), fill=1), error="fill= should be TRUE or FALSE")
 test(2003.3, rbindlist(list(data.table(a=1:2), data.table(b=3:4)), fill=TRUE, use.names=FALSE),
-             data.table(a=c(1:2,NA,NA), b=c(NA,NA,3:4)),
-             warning="use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE")
+             data.table(a=c(1:4)))
+test(2003.4, rbindlist(list(data.table(a=1:2,c=5:6), data.table(b=3:4)), fill=TRUE, use.names=FALSE),
+             data.table(a=c(1:4), c=INT(5,6,NA,NA)))
+test(2003.5, rbindlist(list(data.table(a=1:2), data.table(b=3:4, c=5:6)), fill=TRUE, use.names=FALSE),
+             data.table(a=c(1:4), V1=INT(NA,NA,5,6)))
 
 # chmatch coverage for two different non-ascii encodings matching; issues mentioned in comments in chmatch.c #69 #2538 #111
 x1 = "fa\xE7ile"

diff --git a/man/rbindlist.Rd b/man/rbindlist.Rd
@@ -13,7 +13,7 @@ rbindlist(l, use.names="check", fill=FALSE, idcol=NULL)
 \arguments{
   \item{l}{ A list containing \code{data.table}, \code{data.frame} or \code{list} objects. \code{\dots} is the same but you pass the objects by name separately. }
   \item{use.names}{\code{TRUE} binds by matching column name, \code{FALSE} by position. `check` (default) warns if all items don't have the same names in the same order and then currently proceeds as if `use.names=FALSE` for backwards compatibility (\code{TRUE} in future); see news for v1.12.2.}
-  \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}. When \code{TRUE}, \code{use.names} is set to \code{TRUE}.}
+  \item{fill}{\code{TRUE} fills missing columns with NAs. By default \code{FALSE}.}
   \item{idcol}{Creates a column in the result showing which list item those rows came from. \code{TRUE} names this column \code{".id"}. \code{idcol="file"} names this column \code{"file"}. If the input list has names, those names are the values placed in this id column, otherwise the values are an integer vector \code{1:length(l)}. See \code{examples}.}
 }
 \details{

diff --git a/src/rbindlist.c b/src/rbindlist.c
@@ -12,8 +12,7 @@ SEXP rbindlist(SEXP l, SEXP usenamesArg, SEXP fillArg, SEXP idcolArg)
   if (TYPEOF(l) != VECSXP) error(_("Input to rbindlist must be a list. This list can contain data.tables, data.frames or plain lists."));
   Rboolean usenames = LOGICAL(usenamesArg)[0];
   const bool fill = LOGICAL(fillArg)[0];
-  if (fill && usenames!=TRUE) {
-    if (usenames==FALSE) warning(_("use.names= cannot be FALSE when fill is TRUE. Setting use.names=TRUE.")); // else no warning if usenames==NA (default)
+  if (fill && usenames==NA_LOGICAL) {
     usenames=TRUE;
   }
   const bool idcol = !isNull(idcolArg);