Fixed duplicate column names in merge() when by.x in names(y) (#2631)

Rdatatable · Feb 23, 2018 · b392bfe · b392bfe
1 parent e2386ab
commit b392bfe
Show file tree

Hide file tree

Showing 4 changed files with 40 additions and 6 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -139,6 +139,10 @@ Thanks to @MichaelChirico for reporting and to @MarkusBonsch for the implementat
 
 33. `setattr()` no longer segfaults when setting 'class' to empty character vector, [#2386](https://github.com/Rdatatable/data.table/issues/2386). Thanks to @hatal175 for reporting and to @MarkusBonsch for fixing.
 
+34. Fixed bug where result of `merge()` would contain duplicate column names if `by.x` was also in `names(y)`.
+Where there are duplicate column names (i.e. `suffixes = c("", "")`) `merge()` will throw a warning to match
+the behaviour of `base:::merge.data.frame()`. Thanks to @sritchie73 for reporting and fixing [PR#2631](https://github.com/Rdatatable/data.table/pull/2631). 
+
 #### NOTES
 
 0. The license has been changed from GPL to MPL (Mozilla Public License). All contributors were consulted and approved. [PR#2456](https://github.com/Rdatatable/data.table/pull/2456) details the reasons for the change.

diff --git a/R/merge.R b/R/merge.R
@@ -51,6 +51,13 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
     start[chmatch(dupnames, start, 0L)] = paste(dupnames, suffixes[1L], sep="")
     end[chmatch(dupnames, end, 0L)] = paste(dupnames, suffixes[2L], sep="")
   }
+  # If by.x != by.y then the 'by' column(s) are named as 'by.x' - we need
+  # to also handle cases where the 'by.x' column names are in 'end'
+  dupkeyx = intersect(by.x, end)
+  if (length(dupkeyx)) {
+    by.x[chmatch(dupkeyx, by.x, 0L)] = paste(dupkeyx, suffixes[1L], sep="")
+    end[chmatch(dupkeyx, end, 0L)] = paste(dupkeyx, suffixes[2L], sep="")
+  }
 
   dt = y[x,nomatch = if (all.x) NA else 0L,on=by,allow.cartesian=allow.cartesian]   # includes JIS columns (with a i. prefix if conflict with x names)
 
@@ -78,6 +85,15 @@ merge.data.table <- function(x, y, by = NULL, by.x = NULL, by.y = NULL, all = FA
   if (nrow(dt) > 0L) {
     setkeyv(dt, if (sort) by.x else NULL)
   }
+
+  # Throw warning if there are duplicate column names in 'dt' (i.e. if
+  # `suffixes=c("","")`, to match behaviour in base:::merge.data.frame)
+  resultdupnames <- names(dt)[duplicated(names(dt))]
+  if (length(resultdupnames)) {
+    warning("column names ", paste0("'", resultdupnames, "'", collapse=", "), 
+            " are duplicated in the result")
+  }
+
   # merge resets class, #1378. X[Y] is quite clear that X is being *subset* by Y,
   # makes sense to therefore retain X's class, unlike `merge`. Hard to tell what
   # class to retain for *full join* for example.

diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw
@@ -7163,15 +7163,12 @@ ans2 <- setDF(merge(setDT(d1), setDT(d2), all=TRUE, by="A"))
 test(1542.2, ans1, ans2)
 # test duplicate name cases
 setnames(d2, c("A", "Y"), c("B", "A"))
-ans1 <- suppressWarnings(merge(setDF(d1), setDF(d2), by.x="A", by.y="B"))
-ans2 <- setDF(merge(setDT(d1), setDT(d2), by.x="A", by.y="B"))
-test(1543.3, ans1, ans2)
 ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), by.x="B", by.y="A"))
 ans2 <- setDF(merge(setDT(d2), setDT(d1), by.x="B", by.y="A"))
-test(1543.4, ans1, ans2)
+test(1543.3, ans1, ans2)
 ans1 <- suppressWarnings(merge(setDF(d2), setDF(d1), all=TRUE, by.x="B", by.y="A"))
 ans2 <- setDF(merge(setDT(d2), setDT(d1), all=TRUE, by.x="B", by.y="A"))
-test(1543.5, ans1, ans2)
+test(1543.4, ans1, ans2)
 
 # test for sort=FALSE argument, #1282
 set.seed(1L)
@@ -11717,6 +11714,18 @@ test(1879.6, fread(f, verbose=TRUE), DT,
                   sep = '.*'))
 unlink(f)
 
+# Fix duplicated names arising in merge when by.x in names(y), PR#2631
+# 1880.1 should fail in there are any duplicate names after a join
+# 1880.2 should fail if a warning is not thrown when suffixes leads to duplicate names
+parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43))
+children = data.table(parent=c("Sarah", "Max", "Max"), 
+                       name=c("Oliver", "Sebastian", "Michelle"),
+                       sex=c("M", "M", "F"), age=c(5,8,7))
+joined = merge(parents, children, by.x="name", by.y="parent")                       
+test(1880.1, length(names(joined)), length(unique(names(joined)))) 
+test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L,
+             warning = "column names.*are duplicated in the result")
+
 
 ###################################
 #  Add new tests above this line  #
@@ -11737,4 +11746,3 @@ if (nfail > 0) {
 cat("\n",plat,"\n\nAll ",ntest," tests in inst/tests/tests.Rraw completed ok in ",timetaken(started.at)," on ",date(),"\n",sep="")
 # date() is included so we can tell exactly when these tests ran on CRAN. Sometimes a CRAN log can show error but that can be just
 # stale due to not updating yet since a fix in R-devel, for example.
-
diff --git a/man/merge.Rd b/man/merge.Rd
@@ -66,6 +66,12 @@ control for providing the columns to merge on with the help of the newly impleme
 For a more \code{data.table}-centric way of merging two \code{data.table}s, see
 \code{\link{[.data.table}}; e.g., \code{x[y, ...]}. See FAQ 1.12 for a detailed
 comparison of \code{merge} and \code{x[y, ...]}.
+
+If any column names provided to \code{by.x} also occur in \code{names(y)} but not in \code{by.y},
+then this \code{data.table} method will add the \code{suffixes} to those column names. As of
+R v3.4.3, the \code{data.frame} method will not (leading to duplicate column names in the result) but a patch has
+been proposed (see thread \href{http://r.789695.n4.nabble.com/Duplicate-column-names-created-by-base-merge-when-by-x-has-the-same-name-as-a-column-in-y-td4748345.html}{here})
+which is looking likely to be accepted for a future version of R.
 }
 
 \value{