From c5b5613f5ab51bf3910730a36ea2656570347be0 Mon Sep 17 00:00:00 2001 From: Matt Dowle Date: Fri, 23 Feb 2018 17:58:11 -0800 Subject: [PATCH] Out-of-sample quote rule bump with warning, #2265 --- NEWS.md | 2 +- inst/tests/tests.Rraw | 14 +++++++++++--- src/fread.c | 43 ++++++++++++++++++++++++++++--------------- 3 files changed, 40 insertions(+), 19 deletions(-) diff --git a/NEWS.md b/NEWS.md index 47da1d2c8..e47295738 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,7 +30,7 @@ * `skip=` and `nrow=` are more reliable and no longer affected by invalid lines outside the range specified. Thanks to Ziyad Saeed and Kyle Chung for reporting, [#1267](https://github.com/Rdatatable/data.table/issues/1267). Tests added. * Ram disk (`/dev/shm`) is no longer used for the output of system command input. Although faster when it worked, it was causing too many device full errors; e.g., [#1139](https://github.com/Rdatatable/data.table/issues/1139) and [zUMIs/19](https://github.com/sdparekh/zUMIs/issues/19). Thanks to Kyle Chung for reporting. Standard `tempdir()` is now used. If you wish to use ram disk, set TEMPDIR to `/dev/shm`; see `?tempdir`. * Detecting whether a very long input string is a file name or data is now much faster, [#2531](https://github.com/Rdatatable/data.table/issues/2531). Many thanks to @javrucebo for the detailed report, benchmarks and suggestions. - * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing dev and reporting these regressions before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), [#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526), [#2518](https://github.com/Rdatatable/data.table/issues/2518), [#2515](https://github.com/Rdatatable/data.table/issues/2515), [#1671](https://github.com/Rdatatable/data.table/issues/1671), [#2267](https://github.com/Rdatatable/data.table/issues/2267), [#2561](https://github.com/Rdatatable/data.table/issues/2561), [#2625](https://github.com/Rdatatable/data.table/issues/2625) + * Many thanks to @yaakovfeldman, Guillermo Ponce, Arun Srinivasan, Hugh Parsonage, Mark Klik, Pasha Stetsenko, Mahyar K, Tom Crockett, @cnoelke, @qinjs, @etienne-s, Mark Danese, Avraham Adler, @franknarf1, @MichaelChirico, @tdhock, Luke Tierney for testing dev and reporting these regressions before release to CRAN: [#2070](https://github.com/Rdatatable/data.table/issues/2070), [#2073](https://github.com/Rdatatable/data.table/issues/2073), [#2087](https://github.com/Rdatatable/data.table/issues/2087), [#2091](https://github.com/Rdatatable/data.table/issues/2091), [#2107](https://github.com/Rdatatable/data.table/issues/2107), [fst#50](https://github.com/fstpackage/fst/issues/50#issuecomment-294287846), [#2118](https://github.com/Rdatatable/data.table/issues/2118), [#2092](https://github.com/Rdatatable/data.table/issues/2092), [#1888](https://github.com/Rdatatable/data.table/issues/1888), [#2123](https://github.com/Rdatatable/data.table/issues/2123), [#2167](https://github.com/Rdatatable/data.table/issues/2167), [#2194](https://github.com/Rdatatable/data.table/issues/2194), [#2238](https://github.com/Rdatatable/data.table/issues/2238), [#2228](https://github.com/Rdatatable/data.table/issues/2228), [#1464](https://github.com/Rdatatable/data.table/issues/1464), [#2201](https://github.com/Rdatatable/data.table/issues/2201), [#2287](https://github.com/Rdatatable/data.table/issues/2287), [#2299](https://github.com/Rdatatable/data.table/issues/2299), [#2285](https://github.com/Rdatatable/data.table/issues/2285), [#2251](https://github.com/Rdatatable/data.table/issues/2251), [#2347](https://github.com/Rdatatable/data.table/issues/2347), [#2222](https://github.com/Rdatatable/data.table/issues/2222), [#2352](https://github.com/Rdatatable/data.table/issues/2352), [#2246](https://github.com/Rdatatable/data.table/issues/2246), [#2370](https://github.com/Rdatatable/data.table/issues/2370), [#2371](https://github.com/Rdatatable/data.table/issues/2371), [#2404](https://github.com/Rdatatable/data.table/issues/2404), [#2196](https://github.com/Rdatatable/data.table/issues/2196), [#2322](https://github.com/Rdatatable/data.table/issues/2322), [#2453](https://github.com/Rdatatable/data.table/issues/2453), [#2446](https://github.com/Rdatatable/data.table/issues/2446), [#2464](https://github.com/Rdatatable/data.table/issues/2464), [#2457](https://github.com/Rdatatable/data.table/issues/2457), [#1895](https://github.com/Rdatatable/data.table/issues/1895), [#2481](https://github.com/Rdatatable/data.table/pull/2481), [#2499](https://github.com/Rdatatable/data.table/issues/2499), [#2516](https://github.com/Rdatatable/data.table/issues/2516), [#2520](https://github.com/Rdatatable/data.table/issues/2520), [#2512](https://github.com/Rdatatable/data.table/issues/2512), [#2523](https://github.com/Rdatatable/data.table/issues/2523), [#2542](https://github.com/Rdatatable/data.table/issues/2542), [#2526](https://github.com/Rdatatable/data.table/issues/2526), [#2518](https://github.com/Rdatatable/data.table/issues/2518), [#2515](https://github.com/Rdatatable/data.table/issues/2515), [#1671](https://github.com/Rdatatable/data.table/issues/1671), [#2267](https://github.com/Rdatatable/data.table/issues/2267), [#2561](https://github.com/Rdatatable/data.table/issues/2561), [#2625](https://github.com/Rdatatable/data.table/issues/2625), [#2265](https://github.com/Rdatatable/data.table/issues/2265) 2. `fwrite()`: * empty strings are now always quoted (`,"",`) to distinguish them from `NA` which by default is still empty (`,,`) but can be changed using `na=` as before. If `na=` is provided and `quote=` is the default `'auto'` then `quote=` is set to `TRUE` so that if the `na=` value occurs in the data, it can be distinguished from `NA`. Thanks to Ethan Welty for the request [#2214](https://github.com/Rdatatable/data.table/issues/2214) and Pasha for the code change and tests, [#2215](https://github.com/Rdatatable/data.table/issues/2215). diff --git a/inst/tests/tests.Rraw b/inst/tests/tests.Rraw index 93ea0f50c..6fb833a62 100644 --- a/inst/tests/tests.Rraw +++ b/inst/tests/tests.Rraw @@ -11718,14 +11718,22 @@ unlink(f) # 1880.1 should fail in there are any duplicate names after a join # 1880.2 should fail if a warning is not thrown when suffixes leads to duplicate names parents = data.table(name=c("Sarah", "Max"), sex=c("F", "M"), age=c(41, 43)) -children = data.table(parent=c("Sarah", "Max", "Max"), +children = data.table(parent=c("Sarah", "Max", "Max"), name=c("Oliver", "Sebastian", "Michelle"), sex=c("M", "M", "F"), age=c(5,8,7)) -joined = merge(parents, children, by.x="name", by.y="parent") -test(1880.1, length(names(joined)), length(unique(names(joined)))) +joined = merge(parents, children, by.x="name", by.y="parent") +test(1880.1, length(names(joined)), length(unique(names(joined)))) test(1880.2, nrow(merge(parents, children, by.x="name", by.y="parent", suffixes=c("",""))), 3L, warning = "column names.*are duplicated in the result") +# out-of-sample quote rule bump, #2265 +DT = data.table(A=rep("abc", 10000), B="def") +DT[110, A:='"a"b'] +fwrite(DT, f<-tempfile(), quote=FALSE) +test(1881.1, ans<-fread(f), DT, warning='Found and resolved improper quoting. First healed line 111: <<"a"b,def>>') +test(1881.2, ans[110,A], '"a"b') # double-check the value of interest directly +unlink(f) + ################################### # Add new tests above this line # diff --git a/src/fread.c b/src/fread.c index aeb724381..9f1f4aa6b 100644 --- a/src/fread.c +++ b/src/fread.c @@ -1872,7 +1872,7 @@ int freadMain(freadMainArgs _args) { //********************************************************************************************* // [11] Read the data //********************************************************************************************* - bool stopTeam=false, firstTime=true; // bool for MT-safey (cannot ever read half written bool value) + bool stopTeam=false, firstTime=true, restartTeam=false; // bool for MT-safey (cannot ever read half written bool value badly) int nTypeBump=0, nTypeBumpCols=0; double tRead=0, tReread=0; double thRead=0, thPush=0; // reductions of timings within the parallel region @@ -1883,6 +1883,8 @@ int freadMain(freadMainArgs _args) { size_t DTi = 0; // the current row number in DT that we are writing to const char *headPos = pos; // the jump start corresponding to DTi int nSwept = 0; // count the number of dirty jumps that were swept + const char *quoteRuleBumpedCh = NULL; // in the very rare event of an out-of-sample quote rule bump, give a good warning message + size_t quoteRuleBumpedLine = -1; int buffGrown=0; // chunkBytes is the distance between each jump point; it decides the number of jumps // We may want each chunk to write to its own page of the final column, hence 1000*maxLen @@ -1920,6 +1922,7 @@ int freadMain(freadMainArgs _args) { if (verbose) DTPRINT("[11] Read the data\n"); read: // we'll return here to reread any columns with out-of-sample type exceptions, or dirty jumps + restartTeam = false; if (verbose) DTPRINT(" jumps=[%d..%d), chunk_size=%llu, total_size=%llu\n", jump0, nJumps, (llu)chunkBytes, (llu)(eof-pos)); ASSERT(allocnrow <= nrowLimit, "allocnrow(%llu) <= nrowLimit(%llu)", (llu)allocnrow, (llu)nrowLimit); @@ -1939,7 +1942,7 @@ int freadMain(freadMainArgs _args) { } size_t myNrow = 0; // the number of rows in my chunk size_t myBuffRows = initialBuffRows; // Upon realloc, myBuffRows will increase to grown capacity - bool myStoppingEarly = false; // true when an empty or too-short or too-long row is encountered when fill=false + bool myStopEarly = false; // true when an empty or too-short line is encountered when fill=false, or too-long row // Allocate thread-private row-major `myBuff`s ThreadLocalFreadParsingContext ctx = { @@ -2122,7 +2125,7 @@ int freadMain(freadMainArgs _args) { if (thisType != joldType) { // rare out-of-sample type exception. if (!checkedNumberOfFields && !fill) { - // check this line has the correct number of fields. If not, don't apply the bump from this invalid line. Instead fall through to myStoppingEarly below. + // check this line has the correct number of fields. If not, don't apply the bump from this invalid line. Instead fall through to myStopEarly below. const char *tt = fieldStart; int fieldsRemaining = countfields(&tt); if (j+fieldsRemaining != ncol) break; @@ -2157,10 +2160,9 @@ int freadMain(freadMainArgs _args) { break; } if (jnextJumpStart) { nSwept++; // next jump landed awkwardly and will be reread from headPos; i.e. next jump is dirty and will be swept - stopTeam = true; + stopTeam = restartTeam = true; jump0 = jump+1; // restart team from next jump. jump0 always starts from headPos // if too many jumps are dirty, scale down to single-threaded to save restarting team too much, wastefully. The file requires // a single threaded read anyway due to its tortuous complexity with so many embedded newlines so often @@ -2251,7 +2262,8 @@ int freadMain(freadMainArgs _args) { extraAllocRows = 0; goto read; } - if (jump0>0 && nSwept>0) { + if (restartTeam) { + ASSERT(nSwept>0 || quoteRuleBumpedCh!=NULL, "Internal error: team restart but nSwept==%d and quoteRuleBumpedCh==%p", nSwept, quoteRuleBumpedCh); goto read; } // else nrowLimit applied and stopped early normally @@ -2295,9 +2307,8 @@ int freadMain(freadMainArgs _args) { // reread from the beginning DTi = 0; headPos = pos; + jump0 = 0; firstTime = false; - nTypeBump = 0; // for test 1328.1. Otherwise the last field would get shifted forwards again. - jump0 = 0; // for #2486 nSwept = 0; goto read; } @@ -2341,6 +2352,8 @@ int freadMain(freadMainArgs _args) { DTi+row1line, ncol, tt, strlim(skippedFooter,500)); } } + } else if (quoteRuleBumpedCh!=NULL) { + DTWARN("Found and resolved improper quoting. First healed line %d: <<%s>>", quoteRuleBumpedLine, strlim(quoteRuleBumpedCh, 500)); } if (verbose) {