Skip to content

Commit

Permalink
copy shared columns before setkey (#3768)
Browse files Browse the repository at this point in the history
  • Loading branch information
Michael Chirico authored and mattdowle committed Aug 27, 2019
1 parent 81af9b6 commit 0f39178
Show file tree
Hide file tree
Showing 7 changed files with 68 additions and 9 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Expand Up @@ -234,6 +234,8 @@

30. `groupingsets` functions now properly handle alone special symbols when using an empty set to group by, [#3653](https://github.com/Rdatatable/data.table/issues/3653). Thanks to @Henrik-P for the report.

31. A `data.table` created using `setDT()` on a `data.frame` containing identical columns referencing each other would cause `setkey()` to return incorrect results, [#3496](https://github.com/Rdatatable/data.table/issues/3496) and [#3766](https://github.com/Rdatatable/data.table/issues/3766). Thanks @kirillmayantsev and @alex46015 for reporting, and @jaapwalhout and @Atrebas for helping to debug and isolate the issue.

#### NOTES

1. `rbindlist`'s `use.names="check"` now emits its message for automatic column names (`"V[0-9]+"`) too, [#3484](https://github.com/Rdatatable/data.table/pull/3484). See news item 5 of v1.12.2 below.
Expand Down
5 changes: 3 additions & 2 deletions R/data.table.R
Expand Up @@ -1274,9 +1274,9 @@ replace_order = function(isub, verbose, env) {
if (jcpy) jval = copy(jval)
} else if (address(jval) == address(SDenv$.SD)) {
jval = copy(jval)
} else if ( length(jcpy <- which(vapply_1c(jval, address) %in% vapply_1c(SDenv, address))) ) {
} else if ( length(jcpy <- which(vapply_1c(jval, address) %chin% vapply_1c(SDenv, address))) ) {
for (jidx in jcpy) jval[[jidx]] = copy(jval[[jidx]])
} else if (is.call(jsub) && jsub[[1L]] == "get" && is.list(jval)) {
} else if (is.call(jsub) && jsub[[1L]] == "get") {
jval = copy(jval) # fix for #1212
}
} else {
Expand All @@ -1285,6 +1285,7 @@ replace_order = function(isub, verbose, env) {
if (!truelength(jval)) alloc.col(jval)
}
}

if (!is.null(lhs)) {
# TODO?: use set() here now that it can add new columns. Then remove newnames and alloc logic above.
.Call(Cassign,x,irows,cols,newnames,jval)
Expand Down
15 changes: 9 additions & 6 deletions R/setkey.R
Expand Up @@ -41,6 +41,12 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
setattr(x,"index",NULL) # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys.
return(invisible(x))
}
if (!missing(verbose)) {
stopifnot(isTRUEorFALSE(verbose))
# set the global verbose option because that is fetched from C code without having to pass it through
oldverbose = options(datatable.verbose=verbose)
on.exit(options(oldverbose))
}
if (!is.data.table(x)) stop("x is not a data.table")
if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.")
if (physical && identical(attr(x, ".data.table.locked", exact=TRUE),TRUE)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.")
Expand Down Expand Up @@ -102,12 +108,9 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
}
setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear.
if (length(o)) {
if (verbose) {
tt = suppressMessages(system.time(.Call(Creorder,x,o)))
cat("reorder took", tt["user.self"]+tt["sys.self"], "sec\n")
} else {
.Call(Creorder,x,o)
}
if (verbose) { last.started.at = proc.time() }
.Call(Creorder,x,o)
if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() }
} else {
if (verbose) cat("x is already ordered by these columns, no need to call reorder\n")
} # else empty integer() from forderv means x is already ordered by those cols, nothing to do.
Expand Down
16 changes: 16 additions & 0 deletions inst/tests/tests.Rraw
Expand Up @@ -15771,6 +15771,22 @@ test(2086.04, DT[ , sum(a), keyby = list()], data.table(V1=55L))
test(2086.05, DT[ , sum(a), by = character()], data.table(V1=55L))
test(2086.06, DT[ , sum(a), keyby = character()], data.table(V1=55L))

# simple queries can create tables with columns sharing the same address, #3766
x = data.table(a=1L, b=c(1L, 4L, 2L, 3L), c=4:1)
test(2087.1, x[a == 1L, .(b, b2=b)][ , identical(address(b), address(b2))])
# setkey detects and copies shared address columns, #3496
x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
x$b = x$a
setDT(x)
test(2087.2, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), key="a"),
output='Found and copied 1 column with a shared memory address')
x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
x$b = x$a
x$c = x$a
setDT(x)
test(2087.3, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), c=paste0(1:2), key="a"),
output='Found and copied 2 columns with a shared memory address')


###################################
# Add new tests above this line #
Expand Down
1 change: 1 addition & 0 deletions src/data.table.h
Expand Up @@ -216,4 +216,5 @@ void coerceFill(SEXP fill, double *dfill, int32_t *ifill, int64_t *i64fill);
SEXP coerceFillR(SEXP fill);
bool INHERITS(SEXP x, SEXP char_);
bool Rinherits(SEXP x, SEXP char_);
void copySharedColumns(SEXP x);

2 changes: 1 addition & 1 deletion src/reorder.c
Expand Up @@ -21,6 +21,7 @@ SEXP reorder(SEXP x, SEXP order)
maxSize=SIZEOF(v);
if (ALTREP(v)) SET_VECTOR_ELT(x,i,duplicate(v)); // expand compact vector in place ready for reordering by reference
}
copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768
} else {
if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16)
error("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%d)", type2char(TYPEOF(x)), SIZEOF(x));
Expand All @@ -34,7 +35,6 @@ SEXP reorder(SEXP x, SEXP order)
int nprotect = 0;
if (ALTREP(order)) { order=PROTECT(duplicate(order)); nprotect++; } // TODO: how to fetch range of ALTREP compact vector


const int *restrict idx = INTEGER(order);
int i=0;
while (i<nrow && idx[i] == i+1) i++;
Expand Down
36 changes: 36 additions & 0 deletions src/utils.c
Expand Up @@ -173,3 +173,39 @@ bool Rinherits(SEXP x, SEXP char_) {
return ans;
}

void copySharedColumns(SEXP x) {
const int ncol = length(x);
if (!isNewList(x) || ncol==1) return;
bool *shared = (bool *)R_alloc(ncol, sizeof(bool)); // on R heap in case duplicate() fails
int *savetl = (int *)R_alloc(ncol, sizeof(int)); // on R heap for convenience but could be a calloc
int nShared = 0;
const SEXP *xp = VECTOR_PTR(x);
for (int i=0; i<ncol; ++i) {
SEXP thiscol = xp[i];
const int thistl = TRUELENGTH(thiscol);
if (thistl<0) {
shared[i] = true;
nShared++;
// do not duplicate() here as the duplicate() might fail. Careful to restore tl first to all columns.
// Aside: thistl is which column shares the same address as this one in case that's ever useful in future.
} else {
shared[i] = false;
savetl[i] = thistl; // these are vectors which are all expected to have tl, unlike CHARSXP which often don't (savetl() has CHARSXP in mind)
SET_TRUELENGTH(thiscol, -i-1);
}
}
// now we know nShared and which ones they are (if any), restore original tl back to all columns
for (int i=0; i<ncol; ++i) {
if (!shared[i]) SET_TRUELENGTH(VECTOR_ELT(x, i), savetl[i]);
}
// now that truelength has been restored for all columns, we can finally call duplicate()
if (nShared) {
for (int i=0; i<ncol; ++i) {
if (shared[i])
SET_VECTOR_ELT(x, i, duplicate(VECTOR_ELT(x, i)));
}
if (GetVerbose()) Rprintf("Found and copied %d column%s with a shared memory address\n", nShared, nShared>1?"s":"");
// GetVerbose() (slightly expensive call of all options) called here only when needed
}
}

0 comments on commit 0f39178

Please sign in to comment.