Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

copy shared columns before setkey #3768

Merged
merged 10 commits into from Aug 27, 2019
12 changes: 6 additions & 6 deletions R/setkey.R
Expand Up @@ -41,6 +41,12 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
setattr(x,"index",NULL) # setkey(DT,NULL) also clears secondary keys. setindex(DT,NULL) just clears secondary keys.
return(invisible(x))
}
if (!missing(verbose)) {
stopifnot(isTRUEorFALSE(verbose))
# set the global verbose option because that is fetched from C code without having to pass it through
oldverbose = options(datatable.verbose=verbose)
on.exit(options(oldverbose))
}
if (!is.data.table(x)) stop("x is not a data.table")
if (!is.character(cols)) stop("cols is not a character vector. Please see further information in ?setkey.")
if (physical && identical(attr(x, ".data.table.locked", exact=TRUE),TRUE)) stop("Setting a physical key on .SD is reserved for possible future use; to modify the original data's order by group. Try setindex() instead. Or, set*(copy(.SD)) as a (slow) last resort.")
Expand Down Expand Up @@ -102,12 +108,6 @@ setkeyv = function(x, cols, verbose=getOption("datatable.verbose"), physical=TRU
}
setattr(x,"index",NULL) # TO DO: reorder existing indexes likely faster than rebuilding again. Allow optionally. Simpler for now to clear.
if (length(o)) {
# table has deep-duplicate columns, e.g. #3496
if (any(idx <- duplicated(vapply_1c(x, address)))) {
if (verbose) { last.started.at=proc.time(); cat('Deep duplicate columns (identical address in memory) detected... copying', sum(idx), 'columns ...'); flush.console() }
for (j in which(idx)) set(x, NULL, j, copy(x[[j]]))
if (verbose) { cat("done in", timetaken(last.started.at), "\n"); flush.console() }
}
if (verbose) { last.started.at = proc.time() }
.Call(Creorder,x,o)
if (verbose) { cat("reorder took", timetaken(last.started.at), "\n"); flush.console() }
Expand Down
10 changes: 8 additions & 2 deletions inst/tests/tests.Rraw
Expand Up @@ -15778,8 +15778,14 @@ test(2087.1, x[a == 1L, .(b, b2=b)][ , !identical(address(b), address(b2))])
x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
x$b = x$a
setDT(x)
test(2087.2, setkey(x, a, verbose=TRUE), output='Deep duplicate columns')
test(2087.3, x, data.table(a=paste0(1:2), b=paste0(1:2), key='a'))
test(2087.2, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), key="a"),
output='Found and copied 1 column with a shared memory address')
x = data.frame(a=paste0(2:1), stringsAsFactors=FALSE)
x$b = x$a
x$c = x$a
setDT(x)
test(2087.3, setkey(x, a, verbose=TRUE), data.table(a=paste0(1:2), b=paste0(1:2), c=paste0(1:2), key="a"),
output='Found and copied 2 columns with a shared memory address')
## #2245 unset .data.table.locked even if is.null(irows)
x <- data.table(a=c(0.85, -0.38, 1.19), b=c(0.56, 0.63, -1.30))
test(2087.4, x[ , round(.SD, 1)][ , c := 8], data.table(a=c(.8, -.4, 1.2), b=c(.6,.6,-1.3), c=8))
Expand Down
1 change: 1 addition & 0 deletions src/data.table.h
Expand Up @@ -216,4 +216,5 @@ void coerceFill(SEXP fill, double *dfill, int32_t *ifill, int64_t *i64fill);
SEXP coerceFillR(SEXP fill);
bool INHERITS(SEXP x, SEXP char_);
bool Rinherits(SEXP x, SEXP char_);
void copySharedColumns(SEXP x);

2 changes: 1 addition & 1 deletion src/reorder.c
Expand Up @@ -21,6 +21,7 @@ SEXP reorder(SEXP x, SEXP order)
maxSize=SIZEOF(v);
if (ALTREP(v)) SET_VECTOR_ELT(x,i,duplicate(v)); // expand compact vector in place ready for reordering by reference
}
copySharedColumns(x); // otherwise two columns which point to the same vector would be reordered and then re-reordered, issues linked in PR#3768
} else {
if (SIZEOF(x)!=4 && SIZEOF(x)!=8 && SIZEOF(x)!=16)
error("reorder accepts vectors but this non-VECSXP is type '%s' which isn't yet supported (SIZEOF=%d)", type2char(TYPEOF(x)), SIZEOF(x));
Expand All @@ -34,7 +35,6 @@ SEXP reorder(SEXP x, SEXP order)
int nprotect = 0;
if (ALTREP(order)) { order=PROTECT(duplicate(order)); nprotect++; } // TODO: how to fetch range of ALTREP compact vector


const int *restrict idx = INTEGER(order);
int i=0;
while (i<nrow && idx[i] == i+1) i++;
Expand Down
36 changes: 36 additions & 0 deletions src/utils.c
Expand Up @@ -173,3 +173,39 @@ bool Rinherits(SEXP x, SEXP char_) {
return ans;
}

void copySharedColumns(SEXP x) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

very nice!

const int ncol = length(x);
if (!isNewList(x) || ncol==1) return;
bool *shared = (bool *)R_alloc(ncol, sizeof(bool)); // on R heap in case duplicate() fails
int *savetl = (int *)R_alloc(ncol, sizeof(int)); // on R heap for convenience but could be a calloc
int nShared = 0;
const SEXP *xp = VECTOR_PTR(x);
for (int i=0; i<ncol; ++i) {
SEXP thiscol = xp[i];
const int thistl = TRUELENGTH(thiscol);
if (thistl<0) {
shared[i] = true;
nShared++;
// do not duplicate() here as the duplicate() might fail. Careful to restore tl first to all columns.
// Aside: thistl is which column shares the same address as this one in case that's ever useful in future.
} else {
shared[i] = false;
savetl[i] = thistl; // these are vectors which are all expected to have tl, unlike CHARSXP which often don't (savetl() has CHARSXP in mind)
SET_TRUELENGTH(thiscol, -i-1);
}
}
// now we know nShared and which ones they are (if any), restore original tl back to all columns
for (int i=0; i<ncol; ++i) {
if (!shared[i]) SET_TRUELENGTH(VECTOR_ELT(x, i), savetl[i]);
}
// now that truelength has been restored for all columns, we can finally call duplicate()
if (nShared) {
for (int i=0; i<ncol; ++i) {
if (shared[i])
SET_VECTOR_ELT(x, i, duplicate(VECTOR_ELT(x, i)));
}
if (GetVerbose()) Rprintf("Found and copied %d column%s with a shared memory address\n", nShared, nShared>1?"s":"");
// GetVerbose() (slightly expensive call of all options) called here only when needed
}
}