diff --git a/Cargo.lock b/Cargo.lock index 18a99ced7..ff3247971 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -24,6 +24,15 @@ dependencies = [ "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "base64" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)", + "safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "bitflags" version = "1.0.3" @@ -37,6 +46,16 @@ dependencies = [ "simd 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "byteorder" +version = "1.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "cc" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "cfg-if" version = "0.1.4" @@ -60,6 +79,11 @@ name = "crossbeam" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "dtoa" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "encoding_rs" version = "0.8.4" @@ -71,7 +95,7 @@ dependencies = [ [[package]] name = "encoding_rs_io" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)", @@ -123,6 +147,73 @@ dependencies = [ "regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "grep-matcher" +version = "0.0.1" +dependencies = [ + "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "grep-pcre2" +version = "0.0.1" +dependencies = [ + "grep-matcher 0.0.1", + "pcre2 0.1.0 
(registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "grep-printer" +version = "0.0.1" +dependencies = [ + "base64 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)", + "grep-matcher 0.0.1", + "grep-regex 0.0.1", + "grep-searcher 0.0.1", + "serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", + "serde_json 1.0.24 (registry+https://github.com/rust-lang/crates.io-index)", + "termcolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "grep-regex" +version = "0.0.1" +dependencies = [ + "grep-matcher 0.0.1", + "log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", + "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "grep-searcher" +version = "0.0.1" +dependencies = [ + "bytecount 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)", + "encoding_rs_io 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)", + "grep-matcher 0.0.1", + "grep-regex 0.0.1", + "log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", + "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", + "regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "grep2" +version = "0.1.8" +dependencies = [ + "grep-matcher 0.0.1", + "grep-printer 0.0.1", + "grep-regex 0.0.1", + "grep-searcher 0.0.1", +] + [[package]] name = "ignore" version = "0.4.3" @@ -135,11 +226,16 @@ dependencies = [ "regex 1.0.2 
(registry+https://github.com/rust-lang/crates.io-index)", "same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", "tempdir 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)", - "thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "walkdir 2.1.4 (registry+https://github.com/rust-lang/crates.io-index)", "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "itoa" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "lazy_static" version = "1.0.2" @@ -183,6 +279,49 @@ dependencies = [ "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "pcre2" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "pcre2-sys 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "pcre2-sys" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)", + "libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)", + "pkg-config 0.3.13 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "pkg-config" +version = "0.3.13" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "proc-macro2" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "unicode-xid 0.1.0 
(registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "quote" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "rand" version = "0.4.2" @@ -214,7 +353,7 @@ dependencies = [ "aho-corasick 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)", "memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)", "regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", - "thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", + "thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)", "utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] @@ -258,6 +397,11 @@ dependencies = [ "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "safemem" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" + [[package]] name = "same-file" version = "1.0.2" @@ -266,6 +410,31 @@ dependencies = [ "winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "serde" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" + +[[package]] +name = "serde_derive" +version = "1.0.70" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.14.4 (registry+https://github.com/rust-lang/crates.io-index)", +] + +[[package]] +name = "serde_json" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "dtoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)", + "itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 
1.0.70 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "simd" version = "0.2.2" @@ -276,6 +445,16 @@ name = "strsim" version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" +[[package]] +name = "syn" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "proc-macro2 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)", + "quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)", + "unicode-xid 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "tempdir" version = "0.3.7" @@ -313,11 +492,10 @@ dependencies = [ [[package]] name = "thread_local" -version = "0.3.5" +version = "0.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", - "unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -331,23 +509,15 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] -name = "unreachable" -version = "1.0.0" +name = "unicode-xid" +version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -dependencies = [ - "void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)", -] [[package]] name = "utf8-ranges" version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -[[package]] -name = "void" -version = "1.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" - [[package]] name = "walkdir" version = "2.1.4" @@ -388,42 +558,56 @@ dependencies = [ "checksum aho-corasick 0.6.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c1c6d463cbe7ed28720b5b489e7c083eeb8f90d08be2a0d6bb9e1ffea9ce1afa" "checksum ansi_term 0.11.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ee49baf6cb617b853aa8d93bf420db2383fab46d314482ca2803b40d5fde979b" 
"checksum atty 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "9a7d5b8723950951411ee34d271d99dddcc2035a16ab25310ea2c8cfd4369652" +"checksum base64 0.9.2 (registry+https://github.com/rust-lang/crates.io-index)" = "85415d2594767338a74a30c1d370b2f3262ec1b4ed2d7bba5b3faf4de40467d9" "checksum bitflags 1.0.3 (registry+https://github.com/rust-lang/crates.io-index)" = "d0c54bb8f454c567f21197eefcdbf5679d0bd99f2ddbe52e84c77061952e6789" "checksum bytecount 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f861d9ce359f56dbcb6e0c2a1cb84e52ad732cadb57b806adeb3c7668caccbd8" +"checksum byteorder 1.2.3 (registry+https://github.com/rust-lang/crates.io-index)" = "74c0b906e9446b0a2e4f760cdb3fa4b2c48cdc6db8766a845c54b6ff063fd2e9" +"checksum cc 1.0.18 (registry+https://github.com/rust-lang/crates.io-index)" = "2119ea4867bd2b8ed3aecab467709720b2d55b1bcfe09f772fd68066eaf15275" "checksum cfg-if 0.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "efe5c877e17a9c717a0bf3613b2709f723202c4e4675cc8f12926ded29bcb17e" "checksum clap 2.32.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b957d88f4b6a63b9d70d5f454ac8011819c6efa7727858f458ab71c756ce2d3e" "checksum crossbeam 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "24ce9782d4d5c53674646a6a4c1863a21a8fc0cb649b3c94dfc16e45071dea19" +"checksum dtoa 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "6d301140eb411af13d3115f9a562c85cc6b541ade9dfa314132244aaee7489dd" "checksum encoding_rs 0.8.4 (registry+https://github.com/rust-lang/crates.io-index)" = "88a1b66a0d28af4b03a8c8278c6dcb90e6e600d89c14500a9e7a02e64b9ee3ac" -"checksum encoding_rs_io 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "ad0ffe753ba194ef1bc070e8d61edaadb1536c05e364fc9178ca6cbde10922c4" +"checksum encoding_rs_io 0.1.2 (registry+https://github.com/rust-lang/crates.io-index)" = "f222ff554d6e172f3569a2d7d0fd8061d54215984ef67b24ce031c1fcbf2c9b3" "checksum fnv 1.0.6 
(registry+https://github.com/rust-lang/crates.io-index)" = "2fad85553e09a6f881f739c29f0b00b0f01357c743266d478b68951ce23285f3" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum glob 0.2.11 (registry+https://github.com/rust-lang/crates.io-index)" = "8be18de09a56b60ed0edf84bc9df007e30040691af7acd1c41874faac5895bfb" +"checksum itoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5adb58558dcd1d786b5f0bd15f3226ee23486e24b7b58304b60f64dc68e62606" "checksum lazy_static 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "fb497c35d362b6a331cfd94956a07fc2c78a4604cdbee844a81170386b996dd3" "checksum libc 0.2.42 (registry+https://github.com/rust-lang/crates.io-index)" = "b685088df2b950fccadf07a7187c8ef846a959c142338a48f9dc0b94517eb5f1" "checksum log 0.4.3 (registry+https://github.com/rust-lang/crates.io-index)" = "61bd98ae7f7b754bc53dca7d44b604f733c6bba044ea6f41bc8d89272d8161d2" "checksum memchr 2.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "796fba70e76612589ed2ce7f45282f5af869e0fdd7cc6199fa1aa1f1d591ba9d" "checksum memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "e2ffa2c986de11a9df78620c01eeaaf27d94d3ff02bf81bfcca953102dd0c6ff" "checksum num_cpus 1.8.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c51a3322e4bca9d212ad9a158a02abc6934d005490c054a2778df73a70aa0a30" +"checksum pcre2 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "0c16ec0e30c17f938a2da8ff970ad9a4100166d0538898dcc035b55c393cab54" +"checksum pcre2-sys 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c0775a4527177c46e1e128296cc541347cba6a088db2107588172dd6f6ac98c2" +"checksum pkg-config 0.3.13 
(registry+https://github.com/rust-lang/crates.io-index)" = "104630aa1c83213cbc76db0703630fcb0421dac3585063be4ce9a8a2feeaa745" +"checksum proc-macro2 0.4.9 (registry+https://github.com/rust-lang/crates.io-index)" = "cccdc7557a98fe98453030f077df7f3a042052fae465bb61d2c2c41435cfd9b6" +"checksum quote 0.6.3 (registry+https://github.com/rust-lang/crates.io-index)" = "e44651a0dc4cdd99f71c83b561e221f714912d11af1a4dff0631f923d53af035" "checksum rand 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "eba5f8cb59cc50ed56be8880a5c7b496bfd9bd26394e176bc67884094145c2c5" "checksum redox_syscall 0.1.40 (registry+https://github.com/rust-lang/crates.io-index)" = "c214e91d3ecf43e9a4e41e578973adeb14b474f2bee858742d127af75a0112b1" "checksum redox_termios 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "7e891cfe48e9100a70a3b6eb652fef28920c117d366339687bd5576160db0f76" "checksum regex 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "5bbbea44c5490a1e84357ff28b7d518b4619a159fed5d25f6c1de2d19cc42814" "checksum regex-syntax 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)" = "747ba3b235651f6e2f67dfa8bcdcd073ddb7c243cb21c442fc12395dfcac212d" "checksum remove_dir_all 0.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "3488ba1b9a2084d38645c4c08276a1752dcbf2c7130d74f1569681ad5d2799c5" +"checksum safemem 0.2.0 (registry+https://github.com/rust-lang/crates.io-index)" = "e27a8b19b835f7aea908818e871f5cc3a5a186550c30773be987e155e8163d8f" "checksum same-file 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "cfb6eded0b06a0b512c8ddbcf04089138c9b4362c2f696f3c3d76039d68f3637" +"checksum serde 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "0c3adf19c07af6d186d91dae8927b83b0553d07ca56cbf7f2f32560455c91920" +"checksum serde_derive 1.0.70 (registry+https://github.com/rust-lang/crates.io-index)" = "3525a779832b08693031b8ecfb0de81cd71cfd3812088fafe9a7496789572124" +"checksum serde_json 1.0.24 
(registry+https://github.com/rust-lang/crates.io-index)" = "c3c6908c7b925cd6c590358a4034de93dbddb20c45e1d021931459fd419bf0e2" "checksum simd 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "ed3686dd9418ebcc3a26a0c0ae56deab0681e53fe899af91f5bbcee667ebffb1" "checksum strsim 0.7.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bb4f380125926a99e52bc279241539c018323fab05ad6368b56f93d9369ff550" +"checksum syn 0.14.4 (registry+https://github.com/rust-lang/crates.io-index)" = "2beff8ebc3658f07512a413866875adddd20f4fd47b2a4e6c9da65cd281baaea" "checksum tempdir 0.3.7 (registry+https://github.com/rust-lang/crates.io-index)" = "15f2b5fb00ccdf689e0149d1b1b3c03fead81c2b37735d812fa8bddbbf41b6d8" "checksum termcolor 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "722426c4a0539da2c4ffd9b419d90ad540b4cff4a053be9069c908d4d07e2836" "checksum termion 1.5.1 (registry+https://github.com/rust-lang/crates.io-index)" = "689a3bdfaab439fd92bc87df5c4c78417d3cbe537487274e9b0b2dce76e92096" "checksum textwrap 0.10.0 (registry+https://github.com/rust-lang/crates.io-index)" = "307686869c93e71f94da64286f9a9524c0f308a9e1c87a583de8e9c9039ad3f6" -"checksum thread_local 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "279ef31c19ededf577bfd12dfae728040a21f635b06a24cd670ff510edd38963" +"checksum thread_local 0.3.6 (registry+https://github.com/rust-lang/crates.io-index)" = "c6b53e329000edc2b34dbe8545fd20e55a333362d0a321909685a19bd28c3f1b" "checksum ucd-util 0.1.1 (registry+https://github.com/rust-lang/crates.io-index)" = "fd2be2d6639d0f8fe6cdda291ad456e23629558d466e2789d2c3e9892bda285d" "checksum unicode-width 0.1.5 (registry+https://github.com/rust-lang/crates.io-index)" = "882386231c45df4700b275c7ff55b6f3698780a650026380e72dabe76fa46526" -"checksum unreachable 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "382810877fe448991dfc7f0dd6e3ae5d58088fd0ea5e35189655f84e6814fa56" +"checksum unicode-xid 0.1.0 
(registry+https://github.com/rust-lang/crates.io-index)" = "fc72304796d0818e357ead4e000d19c9c174ab23dc11093ac919054d20a6a7fc" "checksum utf8-ranges 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "662fab6525a98beff2921d7f61a39e7d59e0b425ebc7d0d9e66d316e55124122" -"checksum void 1.0.2 (registry+https://github.com/rust-lang/crates.io-index)" = "6a02e4885ed3bc0f2de90ea6dd45ebcbb66dacffe03547fadbb0eeae2770887d" "checksum walkdir 2.1.4 (registry+https://github.com/rust-lang/crates.io-index)" = "63636bd0eb3d00ccb8b9036381b526efac53caf112b7783b730ab3f8e44da369" "checksum winapi 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "773ef9dcc5f24b7d850d0ff101e542ff24c3b090a9768e03ff889fdef41f00fd" "checksum winapi-i686-pc-windows-gnu 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" diff --git a/Cargo.toml b/Cargo.toml index ffa253f09..ec6ee1b5f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -32,7 +32,17 @@ name = "integration" path = "tests/tests.rs" [workspace] -members = ["grep", "globset", "ignore"] +members = [ + "globset", + "grep", + "grep2", + "grep-matcher", + "grep-pcre2", + "grep-printer", + "grep-regex", + "grep-searcher", + "ignore", +] [dependencies] atty = "0.2.11" @@ -72,10 +82,12 @@ features = ["suggestions", "color"] [features] avx-accel = [ "bytecount/avx-accel", + "grep2/avx-accel", ] simd-accel = [ "bytecount/simd-accel", "encoding_rs/simd-accel", + "grep2/simd-accel", ] [profile.release] diff --git a/grep-matcher/Cargo.toml b/grep-matcher/Cargo.toml new file mode 100644 index 000000000..8056dec15 --- /dev/null +++ b/grep-matcher/Cargo.toml @@ -0,0 +1,24 @@ +[package] +name = "grep-matcher" +version = "0.0.1" #:version +authors = ["Andrew Gallant "] +description = """ +A trait for regular expressions, with a focus on line oriented search. 
+""" +documentation = "https://docs.rs/grep-matcher" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "pattern", "trait"] +license = "Unlicense/MIT" +autotests = false + +[dependencies] +memchr = "2" + +[dev-dependencies] +regex = "1" + +[[test]] +name = "integration" +path = "tests/tests.rs" diff --git a/grep-matcher/LICENSE-MIT b/grep-matcher/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/grep-matcher/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/grep-matcher/README.md b/grep-matcher/README.md new file mode 100644 index 000000000..f83ceade4 --- /dev/null +++ b/grep-matcher/README.md @@ -0,0 +1,36 @@ +grep-matcher +------------ +This crate provides a low level interface for describing regular expression +matchers. 
The `grep` crate uses this interface in order to make the regex +engine it uses pluggable. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-matcher.svg)](https://crates.io/crates/grep-matcher) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-matcher](https://docs.rs/grep-matcher) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-matcher = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_matcher; +``` diff --git a/grep-matcher/UNLICENSE b/grep-matcher/UNLICENSE new file mode 100644 index 000000000..68a49daad --- /dev/null +++ b/grep-matcher/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/grep-matcher/src/interpolate.rs b/grep-matcher/src/interpolate.rs new file mode 100644 index 000000000..168dd3438 --- /dev/null +++ b/grep-matcher/src/interpolate.rs @@ -0,0 +1,328 @@ +use std::str; + +use memchr::memchr; + +/// Interpolate capture references in `replacement` and write the interpolation +/// result to `dst`. References in `replacement` take the form of $N or $name, +/// where `N` is a capture group index and `name` is a capture group name. The +/// function provided, `name_to_index`, maps capture group names to indices. +/// +/// The `append` function given is responsible for writing the replacement +/// to the `dst` buffer. That is, it is called with the capture group index +/// of a capture group reference and is expected to resolve the index to its +/// corresponding matched text. If no such match exists, then `append` should +/// not write anything to its given buffer. 
+pub fn interpolate( + mut replacement: &[u8], + mut append: A, + mut name_to_index: N, + dst: &mut Vec, +) where + A: FnMut(usize, &mut Vec), + N: FnMut(&str) -> Option +{ + while !replacement.is_empty() { + match memchr(b'$', replacement) { + None => break, + Some(i) => { + dst.extend(&replacement[..i]); + replacement = &replacement[i..]; + } + } + if replacement.get(1).map_or(false, |&b| b == b'$') { + dst.push(b'$'); + replacement = &replacement[2..]; + continue; + } + debug_assert!(!replacement.is_empty()); + let cap_ref = match find_cap_ref(replacement) { + Some(cap_ref) => cap_ref, + None => { + dst.push(b'$'); + replacement = &replacement[1..]; + continue; + } + }; + replacement = &replacement[cap_ref.end..]; + match cap_ref.cap { + Ref::Number(i) => append(i, dst), + Ref::Named(name) => { + if let Some(i) = name_to_index(name) { + append(i, dst); + } + } + } + } + dst.extend(replacement); +} + +/// `CaptureRef` represents a reference to a capture group inside some text. +/// The reference is either a capture group name or a number. +/// +/// It is also tagged with the position in the text immediately proceding the +/// capture reference. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct CaptureRef<'a> { + cap: Ref<'a>, + end: usize, +} + +/// A reference to a capture group in some text. +/// +/// e.g., `$2`, `$foo`, `${foo}`. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +enum Ref<'a> { + Named(&'a str), + Number(usize), +} + +impl<'a> From<&'a str> for Ref<'a> { + fn from(x: &'a str) -> Ref<'a> { + Ref::Named(x) + } +} + +impl From for Ref<'static> { + fn from(x: usize) -> Ref<'static> { + Ref::Number(x) + } +} + +/// Parses a possible reference to a capture group name in the given text, +/// starting at the beginning of `replacement`. +/// +/// If no such valid reference could be found, None is returned. 
+fn find_cap_ref(replacement: &[u8]) -> Option { + let mut i = 0; + if replacement.len() <= 1 || replacement[0] != b'$' { + return None; + } + let mut brace = false; + i += 1; + if replacement[i] == b'{' { + brace = true; + i += 1; + } + let mut cap_end = i; + while replacement.get(cap_end).map_or(false, is_valid_cap_letter) { + cap_end += 1; + } + if cap_end == i { + return None; + } + // We just verified that the range 0..cap_end is valid ASCII, so it must + // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 + // check with an unchecked conversion or by parsing the number straight + // from &[u8]. + let cap = str::from_utf8(&replacement[i..cap_end]) + .expect("valid UTF-8 capture name"); + if brace { + if !replacement.get(cap_end).map_or(false, |&b| b == b'}') { + return None; + } + cap_end += 1; + } + Some(CaptureRef { + cap: match cap.parse::() { + Ok(i) => Ref::Number(i as usize), + Err(_) => Ref::Named(cap), + }, + end: cap_end, + }) +} + +/// Returns true if and only if the given byte is allowed in a capture name. +fn is_valid_cap_letter(b: &u8) -> bool { + match *b { + b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' | b'_' => true, + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::{CaptureRef, find_cap_ref, interpolate}; + + macro_rules! find { + ($name:ident, $text:expr) => { + #[test] + fn $name() { + assert_eq!(None, find_cap_ref($text.as_bytes())); + } + }; + ($name:ident, $text:expr, $capref:expr) => { + #[test] + fn $name() { + assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); + } + }; + } + + macro_rules! 
c { + ($name_or_number:expr, $pos:expr) => { + CaptureRef { cap: $name_or_number.into(), end: $pos } + }; + } + + find!(find_cap_ref1, "$foo", c!("foo", 4)); + find!(find_cap_ref2, "${foo}", c!("foo", 6)); + find!(find_cap_ref3, "$0", c!(0, 2)); + find!(find_cap_ref4, "$5", c!(5, 2)); + find!(find_cap_ref5, "$10", c!(10, 3)); + find!(find_cap_ref6, "$42a", c!("42a", 4)); + find!(find_cap_ref7, "${42}a", c!(42, 5)); + find!(find_cap_ref8, "${42"); + find!(find_cap_ref9, "${42 "); + find!(find_cap_ref10, " $0 "); + find!(find_cap_ref11, "$"); + find!(find_cap_ref12, " "); + find!(find_cap_ref13, ""); + + // A convenience routine for using interpolate's unwieldy but flexible API. + fn interpolate_string( + mut name_to_index: Vec<(&'static str, usize)>, + caps: Vec<&'static str>, + replacement: &str, + ) -> String { + name_to_index.sort_by_key(|x| x.0); + + let mut dst = vec![]; + interpolate( + replacement.as_bytes(), + |i, dst| { + if let Some(&s) = caps.get(i) { + dst.extend(s.as_bytes()); + } + }, + |name| -> Option { + name_to_index + .binary_search_by_key(&name, |x| x.0) + .ok() + .map(|i| name_to_index[i].1) + }, + &mut dst, + ); + String::from_utf8(dst).unwrap() + } + + macro_rules! 
interp { + ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { + #[test] + fn $name() { + assert_eq!($expected, interpolate_string($map, $caps, $hay)); + } + } + } + + interp!( + interp1, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo test", + "test xxx test", + ); + + interp!( + interp2, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$footest", + "test", + ); + + interp!( + interp3, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${foo}test", + "testxxxtest", + ); + + interp!( + interp4, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test$2test", + "test", + ); + + interp!( + interp5, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test${2}test", + "testxxxtest", + ); + + interp!( + interp6, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $$foo test", + "test $foo test", + ); + + interp!( + interp7, + vec![("foo", 2)], + vec!["", "", "xxx"], + "test $foo", + "test xxx", + ); + + interp!( + interp8, + vec![("foo", 2)], + vec!["", "", "xxx"], + "$foo test", + "xxx test", + ); + + interp!( + interp9, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $bar$foo", + "test yyyxxx", + ); + + interp!( + interp10, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test $ test", + "test $ test", + ); + + interp!( + interp11, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${} test", + "test ${} test", + ); + + interp!( + interp12, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${ } test", + "test ${ } test", + ); + + interp!( + interp13, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a b} test", + "test ${a b} test", + ); + + interp!( + interp14, + vec![("bar", 1), ("foo", 2)], + vec!["", "yyy", "xxx"], + "test ${a} test", + "test test", + ); +} diff --git a/grep-matcher/src/lib.rs b/grep-matcher/src/lib.rs new file mode 100644 index 000000000..49d6358f8 --- /dev/null +++ b/grep-matcher/src/lib.rs @@ -0,0 +1,1126 @@ +/*! 
+This crate provides an interface for regular expressions, with a focus on line +oriented search. The purpose of this crate is to provide a low level matching +interface that permits any kind of substring or regex implementation to power +the search routines provided by the +[`grep-searcher`](https://docs.rs/grep-searcher) +crate. + +The primary thing provided by this crate is the +[`Matcher`](trait.Matcher.html) +trait. The trait defines an abstract interface for text search. It is robust +enough to support everything from basic substring search all the way to +arbitrarily complex regular expression implementations without sacrificing +performance. + +A key design decision made in this crate is the use of *internal iteration*, +or otherwise known as the "push" model of searching. In this paradigm, +implementations of the `Matcher` trait will drive search and execute callbacks +provided by the caller when a match is found. This is in contrast to the +usual style of *external iteration* (the "pull" model) found throughout the +Rust ecosystem. There are two primary reasons why internal iteration was +chosen: + +* Some search implementations may themselves require internal iteration. + Converting an internal iterator to an external iterator can be non-trivial + and sometimes even practically impossible. +* Rust's type system isn't quite expressive enough to write a generic interface + using external iteration without giving something else up (namely, ease of + use and/or performance). + +In other words, internal iteration was chosen because it is the lowest common +denominator and because it is probably the least bad way of expressing the +interface in today's Rust. As a result, this trait isn't specifically intended +for everyday use, although, you might find it to be a happy price to pay if you +want to write code that is generic over multiple different regex +implementations. 
+*/ + +#![deny(missing_docs)] + +extern crate memchr; + +use std::fmt; +use std::io; +use std::ops; +use std::u64; + +use interpolate::interpolate; + +mod interpolate; + +/// The type of a match. +/// +/// The type of a match is a possibly empty range pointing to a contiguous +/// block of addressable memory. +/// +/// Every `Match` is guaranteed to satisfy the invariant that `start <= end`. +/// +/// # Indexing +/// +/// This type is structurally identical to `std::ops::Range`, but +/// is a bit more ergonomic for dealing with match indices. In particular, +/// this type implements `Copy` and provides methods for building new `Match` +/// values based on old `Match` values. Finally, the invariant that `start` +/// is always less than or equal to `end` is enforced. +/// +/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using +/// range notation. e.g., +/// +/// ``` +/// use grep_matcher::Match; +/// +/// let m = Match::new(2, 5); +/// let bytes = b"abcdefghi"; +/// assert_eq!(b"cde", &bytes[m]); +/// ``` +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct Match { + start: usize, + end: usize, +} + +impl Match { + /// Create a new match. + /// + /// # Panics + /// + /// This function panics if `start > end`. + #[inline] + pub fn new(start: usize, end: usize) -> Match { + assert!(start <= end); + Match { start, end } + } + + /// Creates a zero width match at the given offset. + #[inline] + pub fn zero(offset: usize) -> Match { + Match { start: offset, end: offset } + } + + /// Return the start offset of this match. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Return the end offset of this match. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Return a new match with the start offset replaced with the given + /// value. + /// + /// # Panics + /// + /// This method panics if `start > self.end`. 
+    #[inline]
+    pub fn with_start(&self, start: usize) -> Match {
+        assert!(start <= self.end);
+        Match { start, ..*self }
+    }
+
+    /// Return a new match with the end offset replaced with the given
+    /// value.
+    ///
+    /// # Panics
+    ///
+    /// This method panics if `self.start > end`.
+    #[inline]
+    pub fn with_end(&self, end: usize) -> Match {
+        assert!(self.start <= end);
+        Match { end, ..*self }
+    }
+
+    /// Offset this match by the given amount and return a new match.
+    ///
+    /// This adds the given offset to the start and end of this match, and
+    /// returns the resulting match.
+    ///
+    /// # Panics
+    ///
+    /// This panics if adding the given amount to either the start or end
+    /// offset would result in an overflow.
+    #[inline]
+    pub fn offset(&self, amount: usize) -> Match {
+        Match {
+            start: self.start.checked_add(amount).unwrap(),
+            end: self.end.checked_add(amount).unwrap(),
+        }
+    }
+
+    /// Returns the number of bytes in this match.
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.end - self.start
+    }
+
+    /// Returns true if and only if this match is empty.
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+}
+
+impl ops::Index<Match> for [u8] {
+    type Output = [u8];
+
+    #[inline]
+    fn index(&self, index: Match) -> &[u8] {
+        &self[index.start..index.end]
+    }
+}
+
+impl ops::IndexMut<Match> for [u8] {
+    #[inline]
+    fn index_mut(&mut self, index: Match) -> &mut [u8] {
+        &mut self[index.start..index.end]
+    }
+}
+
+impl ops::Index<Match> for str {
+    type Output = str;
+
+    #[inline]
+    fn index(&self, index: Match) -> &str {
+        &self[index.start..index.end]
+    }
+}
+
+/// A line terminator.
+///
+/// A line terminator represents the end of a line. Generally, every line is
+/// either "terminated" by the end of a stream or a specific byte (or sequence
+/// of bytes).
+///
+/// Generally, a line terminator is a single byte, specifically, `\n`, on
+/// Unix-like systems.
On Windows, a line terminator is `\r\n` (referred to +/// as `CRLF` for `Carriage Return; Line Feed`). +/// +/// The default line terminator is `\n` on all platforms. +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +pub struct LineTerminator(LineTerminatorImp); + +#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] +enum LineTerminatorImp { + /// Any single byte representing a line terminator. + /// + /// We represent this as an array so we can safely convert it to a slice + /// for convenient access. At some point, we can use `std::slice::from_ref` + /// instead. + Byte([u8; 1]), + /// A line terminator represented by `\r\n`. + /// + /// When this option is used, consumers may generally treat a lone `\n` as + /// a line terminator in addition to `\r\n`. + CRLF, +} + +impl LineTerminator { + /// Return a new single-byte line terminator. Any byte is valid. + #[inline] + pub fn byte(byte: u8) -> LineTerminator { + LineTerminator(LineTerminatorImp::Byte([byte])) + } + + /// Return a new line terminator represented by `\r\n`. + /// + /// When this option is used, consumers may generally treat a lone `\n` as + /// a line terminator in addition to `\r\n`. + #[inline] + pub fn crlf() -> LineTerminator { + LineTerminator(LineTerminatorImp::CRLF) + } + + /// Returns true if and only if this line terminator is CRLF. + #[inline] + pub fn is_crlf(&self) -> bool { + self.0 == LineTerminatorImp::CRLF + } + + /// Returns this line terminator as a single byte. + /// + /// If the line terminator is CRLF, then this returns `\n`. This is + /// useful for routines that, for example, find line boundaries by treating + /// `\n` as a line terminator even when it isn't preceded by `\r`. + #[inline] + pub fn as_byte(&self) -> u8 { + match self.0 { + LineTerminatorImp::Byte(array) => array[0], + LineTerminatorImp::CRLF => b'\n', + } + } + + /// Returns this line terminator as a sequence of bytes. 
+ /// + /// This returns a singleton sequence for all line terminators except for + /// `CRLF`, in which case, it returns `\r\n`. + /// + /// The slice returned is guaranteed to have length at least `1`. + #[inline] + pub fn as_bytes(&self) -> &[u8] { + match self.0 { + LineTerminatorImp::Byte(ref array) => array, + LineTerminatorImp::CRLF => &[b'\r', b'\n'], + } + } +} + +impl Default for LineTerminator { + #[inline] + fn default() -> LineTerminator { + LineTerminator::byte(b'\n') + } +} + +/// A set of bytes. +/// +/// In this crate, byte sets are used to express bytes that can never appear +/// anywhere in a match for a particular implementation of the `Matcher` trait. +/// Specifically, if such a set can be determined, then it's possible for +/// callers to perform additional operations on the basis that certain bytes +/// may never match. +/// +/// For example, if a search is configured to possibly produce results that +/// span multiple lines but a caller provided pattern can never match across +/// multiple lines, then it may make sense to divert to more optimized line +/// oriented routines that don't need to handle the multi-line match case. +#[derive(Clone, Debug)] +pub struct ByteSet(BitSet); + +#[derive(Clone, Copy)] +struct BitSet([u64; 4]); + +impl fmt::Debug for BitSet { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let mut fmtd = f.debug_set(); + for b in (0..256).map(|b| b as u8) { + if ByteSet(*self).contains(b) { + fmtd.entry(&b); + } + } + fmtd.finish() + } +} + +impl ByteSet { + /// Create an empty set of bytes. + pub fn empty() -> ByteSet { + ByteSet(BitSet([0; 4])) + } + + /// Create a full set of bytes such that every possible byte is in the set + /// returned. + pub fn full() -> ByteSet { + ByteSet(BitSet([u64::MAX; 4])) + } + + /// Add a byte to this set. + /// + /// If the given byte already belongs to this set, then this is a no-op. 
+ pub fn add(&mut self, byte: u8) { + let bucket = byte / 64; + let bit = byte % 64; + (self.0).0[bucket as usize] |= 1 << bit; + } + + /// Add an inclusive range of bytes. + pub fn add_all(&mut self, start: u8, end: u8) { + for b in (start as u64..end as u64 + 1).map(|b| b as u8) { + self.add(b); + } + } + + /// Remove a byte from this set. + /// + /// If the given byte is not in this set, then this is a no-op. + pub fn remove(&mut self, byte: u8) { + let bucket = byte / 64; + let bit = byte % 64; + (self.0).0[bucket as usize] &= !(1 << bit); + } + + /// Remove an inclusive range of bytes. + pub fn remove_all(&mut self, start: u8, end: u8) { + for b in (start as u64..end as u64 + 1).map(|b| b as u8) { + self.remove(b); + } + } + + /// Return true if and only if the given byte is in this set. + pub fn contains(&self, byte: u8) -> bool { + let bucket = byte / 64; + let bit = byte % 64; + (self.0).0[bucket as usize] & (1 << bit) > 0 + } +} + +/// A trait that describes implementations of capturing groups. +/// +/// When a matcher supports capturing group extraction, then it is the +/// matcher's responsibility to provide an implementation of this trait. +/// +/// Principally, this trait provides a way to access capturing groups +/// in a uniform way that does not require any specific representation. +/// Namely, different matcher implementations may require different in-memory +/// representations of capturing groups. This trait permits matchers to +/// maintain their specific in-memory representation. +/// +/// Note that this trait explicitly does not provide a way to construct a new +/// capture value. Instead, it is the responsibility of a `Matcher` to build +/// one, which might require knowledge of the matcher's internal implementation +/// details. +pub trait Captures { + /// Return the total number of capturing groups. This includes capturing + /// groups that have not matched anything. 
+    fn len(&self) -> usize;
+
+    /// Return the capturing group match at the given index. If no match of
+    /// that capturing group exists, then this returns `None`.
+    ///
+    /// When a matcher reports a match with capturing groups, then the first
+    /// capturing group (at index `0`) must always correspond to the offsets
+    /// for the overall match.
+    fn get(&self, i: usize) -> Option<Match>;
+
+    /// Returns true if and only if these captures are empty. This occurs
+    /// when `len` is `0`.
+    ///
+    /// Note that capturing groups that have non-zero length but otherwise
+    /// contain no matching groups are *not* empty.
+    fn is_empty(&self) -> bool {
+        self.len() == 0
+    }
+
+    /// Expands all instances of `$name` in `replacement` to the corresponding
+    /// capture group `name`, and writes them to the `dst` buffer given.
+    ///
+    /// (Note: If you're looking for a convenient way to perform replacements
+    /// with interpolation, then you'll want to use the `replace_with_captures`
+    /// method on the `Matcher` trait.)
+    ///
+    /// `name` may be an integer corresponding to the index of the
+    /// capture group (counted by order of opening parenthesis where `0` is the
+    /// entire match) or it can be a name (consisting of letters, digits or
+    /// underscores) corresponding to a named capture group.
+    ///
+    /// A `name` is translated to a capture group index via the given
+    /// `name_to_index` function. If `name` isn't a valid capture group
+    /// (whether the name doesn't exist or isn't a valid index), then it is
+    /// replaced with the empty string.
+    ///
+    /// The longest possible name is used. e.g., `$1a` looks up the capture
+    /// group named `1a` and not the capture group at index `1`. To exert
+    /// more precise control over the name, use braces, e.g., `${1}a`. In all
+    /// cases, capture group names are limited to ASCII letters, numbers and
+    /// underscores.
+    ///
+    /// To write a literal `$` use `$$`.
+    ///
+    /// Note that the capture group match indices are resolved by slicing
+    /// the given `haystack`. Generally, this means that `haystack` should be
+    /// the same slice that was searched to get the current capture group
+    /// matches.
+    fn interpolate<F>(
+        &self,
+        name_to_index: F,
+        haystack: &[u8],
+        replacement: &[u8],
+        dst: &mut Vec<u8>,
+    ) where F: FnMut(&str) -> Option<usize>
+    {
+        interpolate(
+            replacement,
+            |i, dst| {
+                if let Some(range) = self.get(i) {
+                    dst.extend(&haystack[range]);
+                }
+            },
+            name_to_index,
+            dst,
+        )
+    }
+}
+
+/// NoCaptures provides an always-empty implementation of the `Captures` trait.
+///
+/// This type is useful for implementations of `Matcher` that don't support
+/// capturing groups.
+#[derive(Clone, Debug)]
+pub struct NoCaptures(());
+
+impl NoCaptures {
+    /// Create an empty set of capturing groups.
+    pub fn new() -> NoCaptures { NoCaptures(()) }
+}
+
+impl Captures for NoCaptures {
+    fn len(&self) -> usize { 0 }
+    fn get(&self, _: usize) -> Option<Match> { None }
+}
+
+/// NoError provides an error type for matchers that never produce errors.
+///
+/// This error type implements the `std::error::Error` and `fmt::Display`
+/// traits for use in matcher implementations that can never produce errors.
+///
+/// The `fmt::Debug` and `fmt::Display` impls for this type panics.
+#[derive(Debug, Eq, PartialEq)]
+pub struct NoError(());
+
+impl ::std::error::Error for NoError {
+    fn description(&self) -> &str { "no error" }
+}
+
+impl fmt::Display for NoError {
+    fn fmt(&self, _: &mut fmt::Formatter) -> fmt::Result {
+        panic!("BUG for NoError: an impossible error occurred")
+    }
+}
+
+impl From<NoError> for io::Error {
+    fn from(_: NoError) -> io::Error {
+        panic!("BUG for NoError: an impossible error occurred")
+    }
+}
+
+/// The type of match for a line oriented matcher.
+#[derive(Clone, Copy, Debug)]
+pub enum LineMatchKind {
+    /// A position inside a line that is known to contain a match.
+    ///
+    /// This position can be anywhere in the line.
It does not need to point + /// at the location of the match. + Confirmed(usize), + /// A position inside a line that may contain a match, and must be searched + /// for verification. + /// + /// This position can be anywhere in the line. It does not need to point + /// at the location of the match. + Candidate(usize), +} + +/// A matcher defines an interface for regular expression implementations. +/// +/// While this trait is large, there are only two required methods that +/// implementors must provide: `find_at` and `new_captures`. If captures +/// aren't supported by your implementation, then `new_captures` can be +/// implemented with +/// [`NoCaptures`](struct.NoCaptures.html). If your implementation does support +/// capture groups, then you should also implement the other capture related +/// methods, as dictated by the documentation. Crucially, this includes +/// `captures_at`. +/// +/// The rest of the methods on this trait provide default implementations on +/// top of `find_at` and `new_captures`. It is not uncommon for implementations +/// to be able to provide faster variants of some methods; in those cases, +/// simply override the default implementation. +pub trait Matcher { + /// The concrete type of capturing groups used for this matcher. + /// + /// If this implementation does not support capturing groups, then set + /// this to `NoCaptures`. + type Captures: Captures; + + /// The error type used by this matcher. + /// + /// For matchers in which an error is not possible, they are encouraged to + /// use the `NoError` type in this crate. In the future, when the "never" + /// (spelled `!`) type is stabilized, then it should probably be used + /// instead. + type Error: fmt::Display; + + /// Returns the start and end byte range of the first match in `haystack` + /// after `at`, where the byte offsets are relative to that start of + /// `haystack` (and not `at`). If no match exists, then `None` is returned. 
+    ///
+    /// The text encoding of `haystack` is not strictly specified. Matchers are
+    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `at == 0`.
+    fn find_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<Match>, Self::Error>;
+
+    /// Creates an empty group of captures suitable for use with the capturing
+    /// APIs of this trait.
+    ///
+    /// Implementations that don't support capturing groups should use
+    /// the `NoCaptures` type and implement this method by calling
+    /// `NoCaptures::new()`.
+    fn new_captures(&self) -> Result<Self::Captures, Self::Error>;
+
+    /// Returns the total number of capturing groups in this matcher.
+    ///
+    /// If a matcher supports capturing groups, then this value must always be
+    /// at least 1, where the first capturing group always corresponds to the
+    /// overall match.
+    ///
+    /// If a matcher does not support capturing groups, then this should
+    /// always return 0.
+    ///
+    /// By default, capturing groups are not supported, so this always
+    /// returns 0.
+    fn capture_count(&self) -> usize {
+        0
+    }
+
+    /// Maps the given capture group name to its corresponding capture group
+    /// index, if one exists. If one does not exist, then `None` is returned.
+    ///
+    /// If the given capture group name maps to multiple indices, then it is
+    /// not specified which one is returned. However, it is guaranteed that
+    /// one of them is returned.
+    ///
+    /// By default, capturing groups are not supported, so this always returns
+    /// `None`.
+    fn capture_index(&self, _name: &str) -> Option<usize> {
+        None
+    }
+
+    /// Returns the start and end byte range of the first match in `haystack`.
+    /// If no match exists, then `None` is returned.
+    ///
+    /// The text encoding of `haystack` is not strictly specified.
Matchers are
+    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
+    fn find(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<Match>, Self::Error> {
+        self.find_at(haystack, 0)
+    }
+
+    /// Executes the given function over successive non-overlapping matches
+    /// in `haystack`. If no match exists, then the given function is never
+    /// called. If the function returns `false`, then iteration stops.
+    fn find_iter<F>(
+        &self,
+        haystack: &[u8],
+        mut matched: F,
+    ) -> Result<(), Self::Error>
+    where F: FnMut(Match) -> bool
+    {
+        self.try_find_iter(haystack, |m| Ok(matched(m)))
+            .map(|r: Result<(), ()>| r.unwrap())
+    }
+
+    /// Executes the given function over successive non-overlapping matches
+    /// in `haystack`. If no match exists, then the given function is never
+    /// called. If the function returns `false`, then iteration stops.
+    /// Similarly, if the function returns an error then iteration stops and
+    /// the error is yielded. If an error occurs while executing the search,
+    /// then it is converted to
+    /// `E`.
+    fn try_find_iter<F, E>(
+        &self,
+        haystack: &[u8],
+        mut matched: F,
+    ) -> Result<Result<(), E>, Self::Error>
+    where F: FnMut(Match) -> Result<bool, E>
+    {
+        let mut last_end = 0;
+        let mut last_match = None;
+
+        loop {
+            if last_end > haystack.len() {
+                return Ok(Ok(()));
+            }
+            let m = match self.find_at(haystack, last_end)? {
+                None => return Ok(Ok(())),
+                Some(m) => m,
+            };
+            if m.start == m.end {
+                // This is an empty match. To ensure we make progress, start
+                // the next search at the smallest possible starting position
+                // of the next match following this one.
+                last_end = m.end + 1;
+                // Don't accept empty matches immediately following a match.
+                // Just move on to the next match.
+                if Some(m.end) == last_match {
+                    continue;
+                }
+            } else {
+                last_end = m.end;
+            }
+            last_match = Some(m.end);
+            match matched(m) {
+                Ok(true) => continue,
+                Ok(false) => return Ok(Ok(())),
+                Err(err) => return Ok(Err(err)),
+            }
+        }
+    }
+
+    /// Populates the first set of capture group matches from `haystack` into
+    /// `caps`. If no match exists, then `false` is returned.
+    ///
+    /// The text encoding of `haystack` is not strictly specified. Matchers are
+    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
+    fn captures(
+        &self,
+        haystack: &[u8],
+        caps: &mut Self::Captures,
+    ) -> Result<bool, Self::Error> {
+        self.captures_at(haystack, 0, caps)
+    }
+
+    /// Executes the given function over successive non-overlapping matches
+    /// in `haystack` with capture groups extracted from each match. If no
+    /// match exists, then the given function is never called. If the function
+    /// returns `false`, then iteration stops.
+    fn captures_iter<F>(
+        &self,
+        haystack: &[u8],
+        caps: &mut Self::Captures,
+        mut matched: F,
+    ) -> Result<(), Self::Error>
+    where F: FnMut(&Self::Captures) -> bool
+    {
+        self.try_captures_iter(haystack, caps, |caps| Ok(matched(caps)))
+            .map(|r: Result<(), ()>| r.unwrap())
+    }
+
+    /// Executes the given function over successive non-overlapping matches
+    /// in `haystack` with capture groups extracted from each match. If no
+    /// match exists, then the given function is never called. If the function
+    /// returns `false`, then iteration stops. Similarly, if the function
+    /// returns an error then iteration stops and the error is yielded. If
+    /// an error occurs while executing the search, then it is converted to
+    /// `E`.
+    fn try_captures_iter<F, E>(
+        &self,
+        haystack: &[u8],
+        caps: &mut Self::Captures,
+        mut matched: F,
+    ) -> Result<Result<(), E>, Self::Error>
+    where F: FnMut(&Self::Captures) -> Result<bool, E>
+    {
+        let mut last_end = 0;
+        let mut last_match = None;
+
+        loop {
+            if last_end > haystack.len() {
+                return Ok(Ok(()));
+            }
+            if !self.captures_at(haystack, last_end, caps)? {
+                return Ok(Ok(()));
+            }
+            let m = caps.get(0).unwrap();
+            if m.start == m.end {
+                // This is an empty match. To ensure we make progress, start
+                // the next search at the smallest possible starting position
+                // of the next match following this one.
+                last_end = m.end + 1;
+                // Don't accept empty matches immediately following a match.
+                // Just move on to the next match.
+                if Some(m.end) == last_match {
+                    continue;
+                }
+            } else {
+                last_end = m.end;
+            }
+            last_match = Some(m.end);
+            match matched(caps) {
+                Ok(true) => continue,
+                Ok(false) => return Ok(Ok(())),
+                Err(err) => return Ok(Err(err)),
+            }
+        }
+    }
+
+    /// Populates the first set of capture group matches from `haystack`
+    /// into `matches` after `at`, where the byte offsets in each capturing
+    /// group are relative to the start of `haystack` (and not `at`). If no
+    /// match exists, then `false` is returned and the contents of the given
+    /// capturing groups are unspecified.
+    ///
+    /// The text encoding of `haystack` is not strictly specified. Matchers are
+    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `at == 0`.
+    ///
+    /// By default, capturing groups aren't supported, and this implementation
+    /// will always behave as if a match were impossible.
+    ///
+    /// Implementors that provide support for capturing groups must guarantee
+    /// that when a match occurs, the first capture match (at index `0`) is
+    /// always set to the overall match offsets.
+    ///
+    /// Note that if implementors seek to support capturing groups, then they
+    /// should implement this method. Other methods that match based on
+    /// captures will then work automatically.
+    fn captures_at(
+        &self,
+        _haystack: &[u8],
+        _at: usize,
+        _caps: &mut Self::Captures,
+    ) -> Result<bool, Self::Error> {
+        Ok(false)
+    }
+
+    /// Replaces every match in the given haystack with the result of calling
+    /// `append`. `append` is given the start and end of a match, along with
+    /// a handle to the `dst` buffer provided.
+    ///
+    /// If the given `append` function returns `false`, then replacement stops.
+    fn replace<F>(
+        &self,
+        haystack: &[u8],
+        dst: &mut Vec<u8>,
+        mut append: F,
+    ) -> Result<(), Self::Error>
+    where F: FnMut(Match, &mut Vec<u8>) -> bool
+    {
+        let mut last_match = 0;
+        self.find_iter(haystack, |m| {
+            dst.extend(&haystack[last_match..m.start]);
+            last_match = m.end;
+            append(m, dst)
+        })?;
+        dst.extend(&haystack[last_match..]);
+        Ok(())
+    }
+
+    /// Replaces every match in the given haystack with the result of calling
+    /// `append` with the matching capture groups.
+    ///
+    /// If the given `append` function returns `false`, then replacement stops.
+    fn replace_with_captures<F>(
+        &self,
+        haystack: &[u8],
+        caps: &mut Self::Captures,
+        dst: &mut Vec<u8>,
+        mut append: F,
+    ) -> Result<(), Self::Error>
+    where F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool
+    {
+        let mut last_match = 0;
+        self.captures_iter(haystack, caps, |caps| {
+            let m = caps.get(0).unwrap();
+            dst.extend(&haystack[last_match..m.start]);
+            last_match = m.end;
+            append(caps, dst)
+        })?;
+        dst.extend(&haystack[last_match..]);
+        Ok(())
+    }
+
+    /// Returns true if and only if the matcher matches the given haystack.
+    ///
+    /// By default, this method is implemented by calling `shortest_match`.
+    fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
+        self.is_match_at(haystack, 0)
+    }
+
+    /// Returns true if and only if the matcher matches the given haystack
+    /// starting at the given position.
+    ///
+    /// By default, this method is implemented by calling `shortest_match_at`.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `at == 0`.
+    fn is_match_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<bool, Self::Error> {
+        Ok(self.shortest_match_at(haystack, at)?.is_some())
+    }
+
+    /// Returns an end location of the first match in `haystack`. If no match
+    /// exists, then `None` is returned.
+    ///
+    /// Note that the end location reported by this method may be less than the
+    /// same end location reported by `find`. For example, running `find` with
+    /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
+    /// 3)`, but `shortest_match` may report `1` as the ending location since
+    /// that is the place at which a match is guaranteed to occur.
+    ///
+    /// This method should never report false positives or false negatives. The
+    /// point of this method is that some implementors may be able to provide
+    /// a faster implementation of this than what `find` does.
+    ///
+    /// By default, this method is implemented by calling `find`.
+    fn shortest_match(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<usize>, Self::Error> {
+        self.shortest_match_at(haystack, 0)
+    }
+
+    /// Returns an end location of the first match in `haystack` starting at
+    /// the given position. If no match exists, then `None` is returned.
+    ///
+    /// Note that the end location reported by this method may be less than the
+    /// same end location reported by `find`. For example, running `find` with
+    /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
+    /// 3)`, but `shortest_match` may report `1` as the ending location since
+    /// that is the place at which a match is guaranteed to occur.
+    ///
+    /// This method should never report false positives or false negatives.
The
+    /// point of this method is that some implementors may be able to provide
+    /// a faster implementation of this than what `find` does.
+    ///
+    /// By default, this method is implemented by calling `find_at`.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `at == 0`.
+    fn shortest_match_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<usize>, Self::Error> {
+        Ok(self.find_at(haystack, at)?.map(|m| m.end))
+    }
+
+    /// If available, return a set of bytes that will never appear in a match
+    /// produced by an implementation.
+    ///
+    /// Specifically, if such a set can be determined, then it's possible for
+    /// callers to perform additional operations on the basis that certain
+    /// bytes may never match.
+    ///
+    /// For example, if a search is configured to possibly produce results
+    /// that span multiple lines but a caller provided pattern can never
+    /// match across multiple lines, then it may make sense to divert to
+    /// more optimized line oriented routines that don't need to handle the
+    /// multi-line match case.
+    ///
+    /// Implementations that produce this set must never report false
+    /// positives, but may produce false negatives. That is, if a byte is in
+    /// this set then it must be guaranteed that it is never in a match. But,
+    /// if a byte is not in this set, then callers cannot assume that a match
+    /// exists with that byte.
+    ///
+    /// By default, this returns `None`.
+    fn non_matching_bytes(&self) -> Option<&ByteSet> {
+        None
+    }
+
+    /// If this matcher was compiled as a line oriented matcher, then this
+    /// method returns the line terminator if and only if the line terminator
+    /// never appears in any match produced by this matcher. If this wasn't
+    /// compiled as a line oriented matcher, or if the aforementioned guarantee
+    /// cannot be made, then this must return `None`, which is the default.
+    /// It is **never wrong** to return `None`, but returning a line terminator
+    /// when it can appear in a match results in unspecified behavior.
+    ///
+    /// The line terminator is typically `b'\n'`, but can be any single byte or
+    /// `CRLF`.
+    ///
+    /// By default, this returns `None`.
+    fn line_terminator(&self) -> Option<LineTerminator> {
+        None
+    }
+
+    /// Return one of the following: a confirmed line match, a candidate line
+    /// match (which may be a false positive) or no match at all (which **must
+    /// not** be a false negative). When reporting a confirmed or candidate
+    /// match, the position returned can be any position in the line.
+    ///
+    /// By default, this never returns a candidate match, and always either
+    /// returns a confirmed match or no match at all.
+    ///
+    /// When a matcher can match spans over multiple lines, then the behavior
+    /// of this method is unspecified. Namely, use of this method only
+    /// makes sense in a context where the caller is looking for the next
+    /// matching line. That is, callers should only use this method when
+    /// `line_terminator` does not return `None`.
+    ///
+    /// # Design rationale
+    ///
+    /// A line matcher is, fundamentally, a normal matcher with the addition
+    /// of one optional method: finding a line. By default, this routine
+    /// is implemented via the matcher's `shortest_match` method, which
+    /// always yields either no match or a `LineMatchKind::Confirmed`. However,
+    /// implementors may provide a routine for this that can return candidate
+    /// lines that need subsequent verification to be confirmed as a match.
+    /// This can be useful in cases where it may be quicker to find candidate
+    /// lines via some other means instead of relying on the more general
+    /// implementations for `find` and `shortest_match`.
+    ///
+    /// For example, consider the regex `\w+foo\s+`. Both `find` and
+    /// `shortest_match` must consider the entire regex, including the `\w+`
+    /// and `\s+`, while searching.
However, this method could look for lines
+    /// containing `foo` and return them as candidates. Finding `foo` might
+    /// be implemented as a highly optimized substring search routine (like
+    /// `memmem`), which is likely to be faster than whatever more generalized
+    /// routine is required for resolving `\w+foo\s+`. The caller is then
+    /// responsible for confirming whether a match exists or not.
+    ///
+    /// Note that while this method may report false positives, it must never
+    /// report false negatives. That is, it can never skip over lines that
+    /// contain a match.
+    fn find_candidate_line(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<LineMatchKind>, Self::Error> {
+        Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
+    }
+}
+
+impl<'a, M: Matcher> Matcher for &'a M {
+    type Captures = M::Captures;
+    type Error = M::Error;
+
+    fn find_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<Match>, Self::Error> {
+        (*self).find_at(haystack, at)
+    }
+
+    fn new_captures(&self) -> Result<Self::Captures, Self::Error> {
+        (*self).new_captures()
+    }
+
+    fn captures_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+        caps: &mut Self::Captures,
+    ) -> Result<bool, Self::Error> {
+        (*self).captures_at(haystack, at, caps)
+    }
+
+    fn capture_index(&self, name: &str) -> Option<usize> {
+        (*self).capture_index(name)
+    }
+
+    fn capture_count(&self) -> usize {
+        (*self).capture_count()
+    }
+
+    fn find(
+        &self,
+        haystack: &[u8]
+    ) -> Result<Option<Match>, Self::Error> {
+        (*self).find(haystack)
+    }
+
+    fn find_iter<F>(
+        &self,
+        haystack: &[u8],
+        matched: F,
+    ) -> Result<(), Self::Error>
+    where F: FnMut(Match) -> bool
+    {
+        (*self).find_iter(haystack, matched)
+    }
+
+    fn try_find_iter<F, E>(
+        &self,
+        haystack: &[u8],
+        matched: F,
+    ) -> Result<Result<(), E>, Self::Error>
+    where F: FnMut(Match) -> Result<bool, E>
+    {
+        (*self).try_find_iter(haystack, matched)
+    }
+
+    fn captures(
+        &self,
+        haystack: &[u8],
+        caps: &mut Self::Captures,
+    ) -> Result<bool, Self::Error> {
+        (*self).captures(haystack, caps)
+    }
+
+    fn captures_iter<F>(
+        &self,
+        haystack: &[u8],
+        caps: &mut Self::Captures,
+
matched: F, + ) -> Result<(), Self::Error> + where F: FnMut(&Self::Captures) -> bool + { + (*self).captures_iter(haystack, caps, matched) + } + + fn try_captures_iter( + &self, + haystack: &[u8], + caps: &mut Self::Captures, + matched: F, + ) -> Result, Self::Error> + where F: FnMut(&Self::Captures) -> Result + { + (*self).try_captures_iter(haystack, caps, matched) + } + + fn replace( + &self, + haystack: &[u8], + dst: &mut Vec, + append: F, + ) -> Result<(), Self::Error> + where F: FnMut(Match, &mut Vec) -> bool + { + (*self).replace(haystack, dst, append) + } + + fn replace_with_captures( + &self, + haystack: &[u8], + caps: &mut Self::Captures, + dst: &mut Vec, + append: F, + ) -> Result<(), Self::Error> + where F: FnMut(&Self::Captures, &mut Vec) -> bool + { + (*self).replace_with_captures(haystack, caps, dst, append) + } + + fn is_match(&self, haystack: &[u8]) -> Result { + (*self).is_match(haystack) + } + + fn is_match_at( + &self, + haystack: &[u8], + at: usize + ) -> Result { + (*self).is_match_at(haystack, at) + } + + fn shortest_match( + &self, + haystack: &[u8], + ) -> Result, Self::Error> { + (*self).shortest_match(haystack) + } + + fn shortest_match_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, Self::Error> { + (*self).shortest_match_at(haystack, at) + } + + fn non_matching_bytes(&self) -> Option<&ByteSet> { + (*self).non_matching_bytes() + } + + fn line_terminator(&self) -> Option { + (*self).line_terminator() + } + + fn find_candidate_line( + &self, + haystack: &[u8], + ) -> Result, Self::Error> { + (*self).find_candidate_line(haystack) + } +} diff --git a/grep-matcher/tests/test_matcher.rs b/grep-matcher/tests/test_matcher.rs new file mode 100644 index 000000000..9edbdf696 --- /dev/null +++ b/grep-matcher/tests/test_matcher.rs @@ -0,0 +1,208 @@ +use grep_matcher::{Captures, Match, Matcher}; +use regex::bytes::Regex; + +use util::{RegexMatcher, RegexMatcherNoCaps}; + +fn matcher(pattern: &str) -> RegexMatcher { + 
RegexMatcher::new(Regex::new(pattern).unwrap()) +} + +fn matcher_no_caps(pattern: &str) -> RegexMatcherNoCaps { + RegexMatcherNoCaps(Regex::new(pattern).unwrap()) +} + +fn m(start: usize, end: usize) -> Match { + Match::new(start, end) +} + +#[test] +fn find() { + let matcher = matcher(r"(\w+)\s+(\w+)"); + assert_eq!(matcher.find(b" homer simpson ").unwrap(), Some(m(1, 14))); +} + +#[test] +fn find_iter() { + let matcher = matcher(r"(\w+)\s+(\w+)"); + let mut matches = vec![]; + matcher.find_iter(b"aa bb cc dd", |m| { + matches.push(m); + true + }).unwrap(); + assert_eq!(matches, vec![m(0, 5), m(6, 11)]); + + // Test that find_iter respects short circuiting. + matches.clear(); + matcher.find_iter(b"aa bb cc dd", |m| { + matches.push(m); + false + }).unwrap(); + assert_eq!(matches, vec![m(0, 5)]); +} + +#[test] +fn try_find_iter() { + #[derive(Clone, Debug, Eq, PartialEq)] + struct MyError; + + let matcher = matcher(r"(\w+)\s+(\w+)"); + let mut matches = vec![]; + let err = matcher.try_find_iter(b"aa bb cc dd", |m| { + if matches.is_empty() { + matches.push(m); + Ok(true) + } else { + Err(MyError) + } + }).unwrap().unwrap_err(); + assert_eq!(matches, vec![m(0, 5)]); + assert_eq!(err, MyError); +} + +#[test] +fn shortest_match() { + let matcher = matcher(r"a+"); + // This tests that the default impl isn't doing anything smart, and simply + // defers to `find`. + assert_eq!(matcher.shortest_match(b"aaa").unwrap(), Some(3)); + // The actual underlying regex is smarter. 
+ assert_eq!(matcher.re.shortest_match(b"aaa"), Some(1)); +} + +#[test] +fn captures() { + let matcher = matcher(r"(?P\w+)\s+(?P\w+)"); + assert_eq!(matcher.capture_count(), 3); + assert_eq!(matcher.capture_index("a"), Some(1)); + assert_eq!(matcher.capture_index("b"), Some(2)); + assert_eq!(matcher.capture_index("nada"), None); + + let mut caps = matcher.new_captures().unwrap(); + assert!(matcher.captures(b" homer simpson ", &mut caps).unwrap()); + assert_eq!(caps.get(0), Some(m(1, 14))); + assert_eq!(caps.get(1), Some(m(1, 6))); + assert_eq!(caps.get(2), Some(m(7, 14))); +} + +#[test] +fn captures_iter() { + let matcher = matcher(r"(?P\w+)\s+(?P\w+)"); + let mut caps = matcher.new_captures().unwrap(); + let mut matches = vec![]; + matcher.captures_iter(b"aa bb cc dd", &mut caps, |caps| { + matches.push(caps.get(0).unwrap()); + matches.push(caps.get(1).unwrap()); + matches.push(caps.get(2).unwrap()); + true + }).unwrap(); + assert_eq!(matches, vec![ + m(0, 5), m(0, 2), m(3, 5), + m(6, 11), m(6, 8), m(9, 11), + ]); + + // Test that captures_iter respects short circuiting. 
+ matches.clear(); + matcher.captures_iter(b"aa bb cc dd", &mut caps, |caps| { + matches.push(caps.get(0).unwrap()); + matches.push(caps.get(1).unwrap()); + matches.push(caps.get(2).unwrap()); + false + }).unwrap(); + assert_eq!(matches, vec![ + m(0, 5), m(0, 2), m(3, 5), + ]); +} + +#[test] +fn try_captures_iter() { + #[derive(Clone, Debug, Eq, PartialEq)] + struct MyError; + + let matcher = matcher(r"(?P\w+)\s+(?P\w+)"); + let mut caps = matcher.new_captures().unwrap(); + let mut matches = vec![]; + let err = matcher.try_captures_iter(b"aa bb cc dd", &mut caps, |caps| { + if matches.is_empty() { + matches.push(caps.get(0).unwrap()); + matches.push(caps.get(1).unwrap()); + matches.push(caps.get(2).unwrap()); + Ok(true) + } else { + Err(MyError) + } + }).unwrap().unwrap_err(); + assert_eq!(matches, vec![m(0, 5), m(0, 2), m(3, 5)]); + assert_eq!(err, MyError); +} + +// Test that our default impls for capturing are correct. Namely, when +// capturing isn't supported by the underlying matcher, then all of the +// various capturing related APIs fail fast. +#[test] +fn no_captures() { + let matcher = matcher_no_caps(r"(?P\w+)\s+(?P\w+)"); + assert_eq!(matcher.capture_count(), 0); + assert_eq!(matcher.capture_index("a"), None); + assert_eq!(matcher.capture_index("b"), None); + assert_eq!(matcher.capture_index("nada"), None); + + let mut caps = matcher.new_captures().unwrap(); + assert!(!matcher.captures(b"homer simpson", &mut caps).unwrap()); + + let mut called = false; + matcher.captures_iter(b"homer simpson", &mut caps, |_| { + called = true; + true + }).unwrap(); + assert!(!called); +} + +#[test] +fn replace() { + let matcher = matcher(r"(\w+)\s+(\w+)"); + let mut dst = vec![]; + matcher.replace(b"aa bb cc dd", &mut dst, |_, dst| { + dst.push(b'z'); + true + }).unwrap(); + assert_eq!(dst, b"z z"); + + // Test that replacements respect short circuiting. 
+ dst.clear(); + matcher.replace(b"aa bb cc dd", &mut dst, |_, dst| { + dst.push(b'z'); + false + }).unwrap(); + assert_eq!(dst, b"z cc dd"); +} + +#[test] +fn replace_with_captures() { + let matcher = matcher(r"(\w+)\s+(\w+)"); + let haystack = b"aa bb cc dd"; + let mut caps = matcher.new_captures().unwrap(); + let mut dst = vec![]; + matcher.replace_with_captures(haystack, &mut caps, &mut dst, |caps, dst| { + caps.interpolate( + |name| matcher.capture_index(name), + haystack, + b"$2 $1", + dst, + ); + true + }).unwrap(); + assert_eq!(dst, b"bb aa dd cc"); + + // Test that replacements respect short circuiting. + dst.clear(); + matcher.replace_with_captures(haystack, &mut caps, &mut dst, |caps, dst| { + caps.interpolate( + |name| matcher.capture_index(name), + haystack, + b"$2 $1", + dst, + ); + false + }).unwrap(); + assert_eq!(dst, b"bb aa cc dd"); +} diff --git a/grep-matcher/tests/tests.rs b/grep-matcher/tests/tests.rs new file mode 100644 index 000000000..d58b2009b --- /dev/null +++ b/grep-matcher/tests/tests.rs @@ -0,0 +1,6 @@ +extern crate grep_matcher; +extern crate regex; + +mod util; + +mod test_matcher; diff --git a/grep-matcher/tests/util.rs b/grep-matcher/tests/util.rs new file mode 100644 index 000000000..57b8fc602 --- /dev/null +++ b/grep-matcher/tests/util.rs @@ -0,0 +1,104 @@ +use std::collections::HashMap; +use std::result; + +use grep_matcher::{Captures, Match, Matcher, NoCaptures, NoError}; +use regex::bytes::{CaptureLocations, Regex}; + +#[derive(Debug)] +pub struct RegexMatcher { + pub re: Regex, + pub names: HashMap, +} + +impl RegexMatcher { + pub fn new(re: Regex) -> RegexMatcher { + let mut names = HashMap::new(); + for (i, optional_name) in re.capture_names().enumerate() { + if let Some(name) = optional_name { + names.insert(name.to_string(), i); + } + } + RegexMatcher { + re: re, + names: names, + } + } +} + +type Result = result::Result; + +impl Matcher for RegexMatcher { + type Captures = RegexCaptures; + type Error = NoError; + + fn 
find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result> { + Ok(self.re + .find_at(haystack, at) + .map(|m| Match::new(m.start(), m.end()))) + } + + fn new_captures(&self) -> Result { + Ok(RegexCaptures(self.re.capture_locations())) + } + + fn captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut RegexCaptures, + ) -> Result { + Ok(self.re.captures_read_at(&mut caps.0, haystack, at).is_some()) + } + + fn capture_count(&self) -> usize { + self.re.captures_len() + } + + fn capture_index(&self, name: &str) -> Option { + self.names.get(name).map(|i| *i) + } + + // We purposely don't implement any other methods, so that we test the + // default impls. The "real" Regex impl for Matcher provides a few more + // impls. e.g., Its `find_iter` impl is faster than what we can do here, + // since the regex crate avoids synchronization overhead. +} + +#[derive(Debug)] +pub struct RegexMatcherNoCaps(pub Regex); + +impl Matcher for RegexMatcherNoCaps { + type Captures = NoCaptures; + type Error = NoError; + + fn find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result> { + Ok(self.0 + .find_at(haystack, at) + .map(|m| Match::new(m.start(), m.end()))) + } + + fn new_captures(&self) -> Result { + Ok(NoCaptures::new()) + } +} + +#[derive(Clone, Debug)] +pub struct RegexCaptures(CaptureLocations); + +impl Captures for RegexCaptures { + fn len(&self) -> usize { + self.0.len() + } + + fn get(&self, i: usize) -> Option { + self.0.pos(i).map(|(s, e)| Match::new(s, e)) + } +} diff --git a/grep-pcre2/Cargo.toml b/grep-pcre2/Cargo.toml new file mode 100644 index 000000000..28d1f1671 --- /dev/null +++ b/grep-pcre2/Cargo.toml @@ -0,0 +1,17 @@ +[package] +name = "grep-pcre2" +version = "0.0.1" #:version +authors = ["Andrew Gallant "] +description = """ +Use PCRE2 with the 'grep' crate. 
+""" +documentation = "https://docs.rs/grep-pcre2" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "pcre", "backreference", "look"] +license = "Unlicense/MIT" + +[dependencies] +grep-matcher = { version = "0.0.1", path = "../grep-matcher" } +pcre2 = "0.1" diff --git a/grep-pcre2/LICENSE-MIT b/grep-pcre2/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/grep-pcre2/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/grep-pcre2/README.md b/grep-pcre2/README.md new file mode 100644 index 000000000..7b9042565 --- /dev/null +++ b/grep-pcre2/README.md @@ -0,0 +1,39 @@ +grep-pcre2 +---------- +The `grep-pcre2` crate provides an implementation of the `Matcher` trait from +the `grep-matcher` crate. 
This implementation permits PCRE2 to be used in the +`grep` crate for fast line oriented searching. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-pcre2.svg)](https://crates.io/crates/grep-pcre2) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-pcre2](https://docs.rs/grep-pcre2) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + +If you're looking to just use PCRE2 from Rust, then you probably want the +[`pcre2`](https://docs.rs/pcre2) +crate, which provides high-level safe bindings to PCRE2. + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-pcre2 = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_pcre2; +``` diff --git a/grep-pcre2/UNLICENSE b/grep-pcre2/UNLICENSE new file mode 100644 index 000000000..68a49daad --- /dev/null +++ b/grep-pcre2/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors.
We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/grep-pcre2/src/error.rs b/grep-pcre2/src/error.rs new file mode 100644 index 000000000..7d0b17bbe --- /dev/null +++ b/grep-pcre2/src/error.rs @@ -0,0 +1,59 @@ +use std::error; +use std::fmt; + +/// An error that can occur in this crate. +/// +/// Generally, this error corresponds to problems building a regular +/// expression, whether it's in parsing, compilation or a problem with +/// guaranteeing a configured optimization. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +impl Error { + pub(crate) fn regex(err: E) -> Error { + Error { kind: ErrorKind::Regex(err.to_string()) } + } + + /// Return the kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } +} + +/// The kind of an error that can occur. +#[derive(Clone, Debug)] +pub enum ErrorKind { + /// An error that occurred as a result of parsing a regular expression. + /// This can be a syntax error or an error that results from attempting to + /// compile a regular expression that is too big. + /// + /// The string here is the underlying error converted to a string. + Regex(String), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) 
+ #[doc(hidden)] + __Nonexhaustive, +} + +impl error::Error for Error { + fn description(&self) -> &str { + match self.kind { + ErrorKind::Regex(_) => "regex error", + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + ErrorKind::Regex(ref s) => write!(f, "{}", s), + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} diff --git a/grep-pcre2/src/lib.rs b/grep-pcre2/src/lib.rs new file mode 100644 index 000000000..245ceab73 --- /dev/null +++ b/grep-pcre2/src/lib.rs @@ -0,0 +1,15 @@ +/*! +An implementation of `grep-matcher`'s `Matcher` trait for +[PCRE2](https://www.pcre.org/). +*/ + +#![deny(missing_docs)] + +extern crate grep_matcher; +extern crate pcre2; + +pub use error::{Error, ErrorKind}; +pub use matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder}; + +mod error; +mod matcher; diff --git a/grep-pcre2/src/matcher.rs b/grep-pcre2/src/matcher.rs new file mode 100644 index 000000000..e9c51be26 --- /dev/null +++ b/grep-pcre2/src/matcher.rs @@ -0,0 +1,425 @@ +use std::collections::HashMap; + +use grep_matcher::{Captures, Match, Matcher}; +use pcre2::bytes::{CaptureLocations, Regex, RegexBuilder}; + +use error::Error; + +/// A builder for configuring the compilation of a PCRE2 regex. +#[derive(Clone, Debug)] +pub struct RegexMatcherBuilder { + builder: RegexBuilder, + case_smart: bool, + word: bool, +} + +impl RegexMatcherBuilder { + /// Create a new matcher builder with a default configuration. + pub fn new() -> RegexMatcherBuilder { + RegexMatcherBuilder { + builder: RegexBuilder::new(), + case_smart: false, + word: false, + } + } + + /// Compile the given pattern into a PCRE matcher using the current + /// configuration. + /// + /// If there was a problem compiling the pattern, then an error is + /// returned. 
+ pub fn build(&self, pattern: &str) -> Result<RegexMatcher, Error> { + let mut builder = self.builder.clone(); + if self.case_smart && !has_uppercase_literal(pattern) { + builder.caseless(true); + } + let res = + if self.word { + let pattern = format!(r"(?<!\w)(?:{})(?!\w)", pattern); + builder.build(&pattern) + } else { + builder.build(pattern) + }; + res.map_err(Error::regex).map(|regex| { + let mut names = HashMap::new(); + for (i, optional_name) in regex.capture_names().iter().enumerate() { + if let Some(ref name) = *optional_name { + names.insert(name.to_string(), i); + } + } + RegexMatcher { regex, names } + }) + } + + /// Set the value for the case insensitive (`i`) flag. + /// + /// When enabled, letters in the pattern will match both upper case and + /// lower case variants. This is disabled by default. + pub fn caseless(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.caseless(yes); + self + } + + /// Whether to enable "smart case" or not. + /// + /// When smart case is enabled, the builder will automatically enable + /// case insensitive matching based on how the pattern is written. Namely, + /// case insensitive mode is enabled when both of the following things + /// are believed to be true: + /// + /// 1. The pattern contains at least one literal character. For example, + /// `a\w` contains a literal (`a`) but `\w` does not. + /// 2. Of the literals in the pattern, none of them are considered to be + /// uppercase according to Unicode. For example, `foo\pL` has no + /// uppercase literals but `Foo\pL` does. + /// + /// Note that the implementation of this is not perfect. Namely, `\p{Ll}` + /// will prevent case insensitive matching even though it is part of a meta + /// sequence. This bug will probably never be fixed. + pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.case_smart = yes; + self + } + + /// Enables "dot all" matching. + /// + /// When enabled, the `.` metacharacter in the pattern matches any + /// character, including `\n`. When disabled (the default), `.` will match + /// any character except for `\n`. + /// + /// This option corresponds to the `s` flag. + pub fn dotall(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.dotall(yes); + self + } + + /// Enable "extended" mode in the pattern, where whitespace is ignored. + /// + /// This option corresponds to the `x` flag. + pub fn extended(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.extended(yes); + self + } + + /// Enable multiline matching mode.
+ /// + /// When enabled, the `^` and `$` anchors will match both at the beginning + /// and end of a subject string, in addition to matching at the start of + /// a line and the end of a line. When disabled, the `^` and `$` anchors + /// will only match at the beginning and end of a subject string. + /// + /// This option corresponds to the `m` flag. + pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.multi_line(yes); + self + } + + /// Enable matching of CRLF as a line terminator. + /// + /// When enabled, anchors such as `^` and `$` will match any of the + /// following as a line terminator: `\r`, `\n` or `\r\n`. + /// + /// This is disabled by default, in which case, only `\n` is recognized as + /// a line terminator. + pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.crlf(yes); + self + } + + /// Require that all matches occur on word boundaries. + /// + /// Enabling this option is subtly different than putting `\b` assertions + /// on both sides of your pattern. In particular, a `\b` assertion requires + /// that one side of it match a word character while the other match a + /// non-word character. This option, in contrast, merely requires that + /// one side match a non-word character. + /// + /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a + /// word character. However, `-2` with this `word` option enabled will + /// match the `-2` in `foo -2 bar`. + pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.word = yes; + self + } + + /// Enable Unicode matching mode. + /// + /// When enabled, the following patterns become Unicode aware: `\b`, `\B`, + /// `\d`, `\D`, `\s`, `\S`, `\w`, `\W`. + /// + /// When set, this implies UTF matching mode. It is not possible to enable + /// Unicode matching mode without enabling UTF matching mode. + /// + /// This is disabled by default. 
+ pub fn ucp(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.ucp(yes); + self + } + + /// Enable UTF matching mode. + /// + /// When enabled, characters are treated as sequences of code units that + /// make up a single codepoint instead of as single bytes. For example, + /// this will cause `.` to match any single UTF-8 encoded codepoint, where + /// as when this is disabled, `.` will match any single byte (except for + /// `\n` in both cases, unless "dot all" mode is enabled). + /// + /// Note that when UTF matching mode is enabled, every search performed + /// will do a UTF-8 validation check, which can impact performance. The + /// UTF-8 check can be disabled via the `disable_utf_check` option, but it + /// is undefined behavior to enable UTF matching mode and search invalid + /// UTF-8. + /// + /// This is disabled by default. + pub fn utf(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.utf(yes); + self + } + + /// When UTF matching mode is enabled, this will disable the UTF checking + /// that PCRE2 will normally perform automatically. If UTF matching mode + /// is not enabled, then this has no effect. + /// + /// UTF checking is enabled by default when UTF matching mode is enabled. + /// If UTF matching mode is enabled and UTF checking is enabled, then PCRE2 + /// will return an error if you attempt to search a subject string that is + /// not valid UTF-8. + /// + /// # Safety + /// + /// It is undefined behavior to disable the UTF check in UTF matching mode + /// and search a subject string that is not valid UTF-8. When the UTF check + /// is disabled, callers must guarantee that the subject string is valid + /// UTF-8. + pub unsafe fn disable_utf_check(&mut self) -> &mut RegexMatcherBuilder { + self.builder.disable_utf_check(); + self + } + + /// Enable PCRE2's JIT. + /// + /// This generally speeds up matching quite a bit. The downside is that it + /// can increase the time it takes to compile a pattern.
+ /// + /// This is disabled by default. + pub fn jit(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.builder.jit(yes); + self + } +} + +/// An implementation of the `Matcher` trait using PCRE2. +#[derive(Clone, Debug)] +pub struct RegexMatcher { + regex: Regex, + names: HashMap, +} + +impl RegexMatcher { + /// Create a new matcher from the given pattern using the default + /// configuration. + pub fn new(pattern: &str) -> Result { + RegexMatcherBuilder::new().build(pattern) + } +} + +impl Matcher for RegexMatcher { + type Captures = RegexCaptures; + type Error = Error; + + fn find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, Error> { + Ok(self.regex + .find_at(haystack, at) + .map_err(Error::regex)? + .map(|m| Match::new(m.start(), m.end()))) + } + + fn new_captures(&self) -> Result { + Ok(RegexCaptures::new(self.regex.capture_locations())) + } + + fn capture_count(&self) -> usize { + self.regex.captures_len() + } + + fn capture_index(&self, name: &str) -> Option { + self.names.get(name).map(|i| *i) + } + + fn try_find_iter( + &self, + haystack: &[u8], + mut matched: F, + ) -> Result, Error> + where F: FnMut(Match) -> Result + { + for result in self.regex.find_iter(haystack) { + let m = result.map_err(Error::regex)?; + match matched(Match::new(m.start(), m.end())) { + Ok(true) => continue, + Ok(false) => return Ok(Ok(())), + Err(err) => return Ok(Err(err)), + } + } + Ok(Ok(())) + } + + fn captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut RegexCaptures, + ) -> Result { + Ok(self.regex + .captures_read_at(&mut caps.locs, haystack, at) + .map_err(Error::regex)? + .is_some()) + } +} + +/// Represents the match offsets of each capturing group in a match. +/// +/// The first, or `0`th capture group, always corresponds to the entire match +/// and is guaranteed to be present when a match occurs. 
The next capture +/// group, at index `1`, corresponds to the first capturing group in the regex, +/// ordered by the position at which the left opening parenthesis occurs. +/// +/// Note that not all capturing groups are guaranteed to be present in a match. +/// For example, in the regex, `(?P<foo>\w)|(?P<bar>\W)`, only one of `foo` +/// or `bar` will ever be set in any given match. +/// +/// In order to access a capture group by name, you'll need to first find the +/// index of the group using the corresponding matcher's `capture_index` +/// method, and then use that index with `RegexCaptures::get`. +#[derive(Clone, Debug)] +pub struct RegexCaptures { + /// Where the locations are stored. + locs: CaptureLocations, +} + +impl Captures for RegexCaptures { + fn len(&self) -> usize { + self.locs.len() + } + + fn get(&self, i: usize) -> Option<Match> { + self.locs.get(i).map(|(s, e)| Match::new(s, e)) + } +} + +impl RegexCaptures { + pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures { + RegexCaptures { locs } + } +} + +/// Determine whether the pattern contains an uppercase character which should +/// negate the effect of the smart-case option. +/// +/// Ideally we would be able to check the AST in order to correctly handle +/// things like '\p{Ll}' and '\p{Lu}' (which should be treated as explicitly +/// cased), but PCRE doesn't expose enough details for that kind of analysis. +/// For now, our 'good enough' solution is to simply perform a semi-naïve +/// scan of the input pattern and ignore all characters following a '\'. +/// This at least lets us support the most common cases, like 'foo\w' and +/// 'foo\S', in an intuitive manner.
+fn has_uppercase_literal(pattern: &str) -> bool { + let mut chars = pattern.chars(); + while let Some(c) = chars.next() { + if c == '\\' { + chars.next(); + } else if c.is_uppercase() { + return true; + } + } + false +} + +#[cfg(test)] +mod tests { + use grep_matcher::{LineMatchKind, Matcher}; + use super::*; + + // Test that enabling word matches does the right thing and demonstrate + // the difference between it and surrounding the regex in `\b`. + #[test] + fn word() { + let matcher = RegexMatcherBuilder::new() + .word(true) + .build(r"-2") + .unwrap(); + assert!(matcher.is_match(b"abc -2 foo").unwrap()); + + let matcher = RegexMatcherBuilder::new() + .word(false) + .build(r"\b-2\b") + .unwrap(); + assert!(!matcher.is_match(b"abc -2 foo").unwrap()); + } + + // Test that enabling CRLF permits `$` to match at the end of a line. + #[test] + fn line_terminator_crlf() { + // Test normal use of `$` with a `\n` line terminator. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .build(r"abc$") + .unwrap(); + assert!(matcher.is_match(b"abc\n").unwrap()); + + // Test that `$` doesn't match at `\r\n` boundary normally. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .build(r"abc$") + .unwrap(); + assert!(!matcher.is_match(b"abc\r\n").unwrap()); + + // Now check the CRLF handling. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .crlf(true) + .build(r"abc$") + .unwrap(); + assert!(matcher.is_match(b"abc\r\n").unwrap()); + } + + // Test that smart case works. + #[test] + fn case_smart() { + let matcher = RegexMatcherBuilder::new() + .case_smart(true) + .build(r"abc") + .unwrap(); + assert!(matcher.is_match(b"ABC").unwrap()); + + let matcher = RegexMatcherBuilder::new() + .case_smart(true) + .build(r"aBc") + .unwrap(); + assert!(!matcher.is_match(b"ABC").unwrap()); + } + + // Test that finding candidate lines works as expected. 
+ #[test] + fn candidate_lines() { + fn is_confirmed(m: LineMatchKind) -> bool { + match m { + LineMatchKind::Confirmed(_) => true, + _ => false, + } + } + + let matcher = RegexMatcherBuilder::new() + .build(r"\wfoo\s") + .unwrap(); + let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap(); + assert!(is_confirmed(m)); + } +} diff --git a/grep-printer/Cargo.toml b/grep-printer/Cargo.toml new file mode 100644 index 000000000..ffc85bc83 --- /dev/null +++ b/grep-printer/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "grep-printer" +version = "0.0.1" #:version +authors = ["Andrew Gallant "] +description = """ +An implementation of the grep crate's Sink trait that provides standard +printing of search results, similar to grep itself. +""" +documentation = "https://docs.rs/grep-printer" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["grep", "pattern", "print", "printer", "sink"] +license = "Unlicense/MIT" + +[features] +default = ["serde1"] +serde1 = ["base64", "serde", "serde_derive", "serde_json"] + +[dependencies] +base64 = { version = "0.9", optional = true } +grep-matcher = { version = "0.0.1", path = "../grep-matcher" } +grep-searcher = { version = "0.0.1", path = "../grep-searcher" } +termcolor = "1" +serde = { version = "1", optional = true } +serde_derive = { version = "1", optional = true } +serde_json = { version = "1", optional = true } + +[dev-dependencies] +grep-regex = { version = "0.0.1", path = "../grep-regex" } diff --git a/grep-printer/LICENSE-MIT b/grep-printer/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/grep-printer/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without 
limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/grep-printer/README.md b/grep-printer/README.md new file mode 100644 index 000000000..8ccdf9514 --- /dev/null +++ b/grep-printer/README.md @@ -0,0 +1,35 @@ +grep-printer +------------ +Print results from line oriented searching in a human readable, aggregate or +JSON Lines format. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-printer.svg)](https://crates.io/crates/grep-printer) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-printer](https://docs.rs/grep-printer) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. 
+ + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-printer = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_printer; +``` diff --git a/grep-printer/UNLICENSE b/grep-printer/UNLICENSE new file mode 100644 index 000000000..68a49daad --- /dev/null +++ b/grep-printer/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/grep-printer/src/color.rs b/grep-printer/src/color.rs new file mode 100644 index 000000000..dcaca59df --- /dev/null +++ b/grep-printer/src/color.rs @@ -0,0 +1,366 @@ +use std::error; +use std::fmt; +use std::str::FromStr; + +use termcolor::{Color, ColorSpec, ParseColorError}; + +/// An error that can occur when parsing color specifications. 
+#[derive(Clone, Debug, Eq, PartialEq)] +pub enum ColorError { + /// This occurs when an unrecognized output type is used. + UnrecognizedOutType(String), + /// This occurs when an unrecognized spec type is used. + UnrecognizedSpecType(String), + /// This occurs when an unrecognized color name is used. + UnrecognizedColor(String, String), + /// This occurs when an unrecognized style attribute is used. + UnrecognizedStyle(String), + /// This occurs when the format of a color specification is invalid. + InvalidFormat(String), +} + +impl error::Error for ColorError { + fn description(&self) -> &str { + match *self { + ColorError::UnrecognizedOutType(_) => "unrecognized output type", + ColorError::UnrecognizedSpecType(_) => "unrecognized spec type", + ColorError::UnrecognizedColor(_, _) => "unrecognized color name", + ColorError::UnrecognizedStyle(_) => "unrecognized style attribute", + ColorError::InvalidFormat(_) => "invalid color spec", + } + } +} + +impl ColorError { + fn from_parse_error(err: ParseColorError) -> ColorError { + ColorError::UnrecognizedColor( + err.invalid().to_string(), + err.to_string(), + ) + } +} + +impl fmt::Display for ColorError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + ColorError::UnrecognizedOutType(ref name) => { + write!( + f, + "unrecognized output type '{}'. Choose from: \ + path, line, column, match.", + name, + ) + } + ColorError::UnrecognizedSpecType(ref name) => { + write!( + f, + "unrecognized spec type '{}'. Choose from: \ + fg, bg, style, none.", + name, + ) + } + ColorError::UnrecognizedColor(_, ref msg) => { + write!(f, "{}", msg) + } + ColorError::UnrecognizedStyle(ref name) => { + write!( + f, + "unrecognized style attribute '{}'. Choose from: \ + nobold, bold, nointense, intense, nounderline, \ + underline.", + name, + ) + } + ColorError::InvalidFormat(ref original) => { + write!( + f, + "invalid color spec format: '{}'. 
Valid format \ + is '(path|line|column|match):(fg|bg|style):(value)'.", + original, + ) + } + } + } +} + +/// A merged set of color specifications. +/// +/// This set of color specifications represents the various color types that +/// are supported by the printers in this crate. A set of color specifications +/// can be created from a sequence of +/// [`UserColorSpec`s](struct.UserColorSpec.html). +#[derive(Clone, Debug, Default, Eq, PartialEq)] +pub struct ColorSpecs { + path: ColorSpec, + line: ColorSpec, + column: ColorSpec, + matched: ColorSpec, +} + +/// A single color specification provided by the user. +/// +/// ## Format +/// +/// The format of a `Spec` is a triple: `{type}:{attribute}:{value}`. Each +/// component is defined as follows: +/// +/// * `{type}` can be one of `path`, `line`, `column` or `match`. +/// * `{attribute}` can be one of `fg`, `bg` or `style`. `{attribute}` may also +/// be the special value `none`, in which case, `{value}` can be omitted. +/// * `{value}` is either a color name (for `fg`/`bg`) or a style instruction. +/// +/// `{type}` controls which part of the output should be styled. +/// +/// When `{attribute}` is `none`, then this should cause any existing style +/// settings to be cleared for the specified `type`. +/// +/// `{value}` should be a color when `{attribute}` is `fg` or `bg`, or it +/// should be a style instruction when `{attribute}` is `style`. When +/// `{attribute}` is `none`, `{value}` must be omitted. +/// +/// Valid colors are `black`, `blue`, `green`, `red`, `cyan`, `magenta`, +/// `yellow`, `white`. Extended colors can also be specified, and are formatted +/// as `x` (for 256-bit colors) or `x,x,x` (for 24-bit true color), where +/// `x` is a number between 0 and 255 inclusive. `x` may be given as a normal +/// decimal number or a hexadecimal number, where the latter is prefixed by +/// `0x`. +/// +/// Valid style instructions are `nobold`, `bold`, `intense`, `nointense`, +/// `underline`, `nounderline`. 
+/// +/// ## Example +/// +/// The standard way to build a `UserColorSpec` is to parse it from a string. +/// Once multiple `UserColorSpec`s have been constructed, they can be provided +/// to the standard printer where they will automatically be applied to the +/// output. +/// +/// A `UserColorSpec` can also be converted to a `termcolor::ColorSpec`: +/// +/// ```rust +/// extern crate grep_printer; +/// extern crate termcolor; +/// +/// # fn main() { +/// use termcolor::{Color, ColorSpec}; +/// use grep_printer::UserColorSpec; +/// +/// let user_spec1: UserColorSpec = "path:fg:blue".parse().unwrap(); +/// let user_spec2: UserColorSpec = "match:bg:0xff,0x7f,0x00".parse().unwrap(); +/// +/// let spec1 = user_spec1.to_color_spec(); +/// let spec2 = user_spec2.to_color_spec(); +/// +/// assert_eq!(spec1.fg(), Some(&Color::Blue)); +/// assert_eq!(spec2.bg(), Some(&Color::Rgb(0xFF, 0x7F, 0x00))); +/// # } +/// ``` +#[derive(Clone, Debug, Eq, PartialEq)] +pub struct UserColorSpec { + ty: OutType, + value: SpecValue, +} + +impl UserColorSpec { + /// Convert this user provided color specification to a specification that + /// can be used with `termcolor`. This drops the type of this specification + /// (where the type indicates where the color is applied in the standard + /// printer, e.g., to the file path or the line numbers, etc.). + pub fn to_color_spec(&self) -> ColorSpec { + let mut spec = ColorSpec::default(); + self.value.merge_into(&mut spec); + spec + } +} + +/// The actual value given by the specification. +#[derive(Clone, Debug, Eq, PartialEq)] +enum SpecValue { + None, + Fg(Color), + Bg(Color), + Style(Style), +} + +/// The set of configurable portions of ripgrep's output. +#[derive(Clone, Debug, Eq, PartialEq)] +enum OutType { + Path, + Line, + Column, + Match, +} + +/// The specification type. +#[derive(Clone, Debug, Eq, PartialEq)] +enum SpecType { + Fg, + Bg, + Style, + None, +} + +/// The set of available styles for use in the terminal. 
+#[derive(Clone, Debug, Eq, PartialEq)] +enum Style { + Bold, + NoBold, + Intense, + NoIntense, + Underline, + NoUnderline +} + +impl ColorSpecs { + /// Create color specifications from a list of user supplied + /// specifications. + pub fn new(specs: &[UserColorSpec]) -> ColorSpecs { + let mut merged = ColorSpecs::default(); + for spec in specs { + match spec.ty { + OutType::Path => spec.merge_into(&mut merged.path), + OutType::Line => spec.merge_into(&mut merged.line), + OutType::Column => spec.merge_into(&mut merged.column), + OutType::Match => spec.merge_into(&mut merged.matched), + } + } + merged + } + + /// Return the color specification for coloring file paths. + pub fn path(&self) -> &ColorSpec { + &self.path + } + + /// Return the color specification for coloring line numbers. + pub fn line(&self) -> &ColorSpec { + &self.line + } + + /// Return the color specification for coloring column numbers. + pub fn column(&self) -> &ColorSpec { + &self.column + } + + /// Return the color specification for coloring matched text. + pub fn matched(&self) -> &ColorSpec { + &self.matched + } +} + +impl UserColorSpec { + /// Merge this spec into the given color specification. + fn merge_into(&self, cspec: &mut ColorSpec) { + self.value.merge_into(cspec); + } +} + +impl SpecValue { + /// Merge this spec value into the given color specification. 
+ fn merge_into(&self, cspec: &mut ColorSpec) { + match *self { + SpecValue::None => cspec.clear(), + SpecValue::Fg(ref color) => { cspec.set_fg(Some(color.clone())); } + SpecValue::Bg(ref color) => { cspec.set_bg(Some(color.clone())); } + SpecValue::Style(ref style) => { + match *style { + Style::Bold => { cspec.set_bold(true); } + Style::NoBold => { cspec.set_bold(false); } + Style::Intense => { cspec.set_intense(true); } + Style::NoIntense => { cspec.set_intense(false); } + Style::Underline => { cspec.set_underline(true); } + Style::NoUnderline => { cspec.set_underline(false); } + } + } + } + } +} + +impl FromStr for UserColorSpec { + type Err = ColorError; + + fn from_str(s: &str) -> Result { + let pieces: Vec<&str> = s.split(':').collect(); + if pieces.len() <= 1 || pieces.len() > 3 { + return Err(ColorError::InvalidFormat(s.to_string())); + } + let otype: OutType = pieces[0].parse()?; + match pieces[1].parse()? { + SpecType::None => { + Ok(UserColorSpec { + ty: otype, + value: SpecValue::None, + }) + } + SpecType::Style => { + if pieces.len() < 3 { + return Err(ColorError::InvalidFormat(s.to_string())); + } + let style: Style = pieces[2].parse()?; + Ok(UserColorSpec { ty: otype, value: SpecValue::Style(style) }) + } + SpecType::Fg => { + if pieces.len() < 3 { + return Err(ColorError::InvalidFormat(s.to_string())); + } + let color: Color = pieces[2] + .parse() + .map_err(ColorError::from_parse_error)?; + Ok(UserColorSpec { ty: otype, value: SpecValue::Fg(color) }) + } + SpecType::Bg => { + if pieces.len() < 3 { + return Err(ColorError::InvalidFormat(s.to_string())); + } + let color: Color = pieces[2] + .parse() + .map_err(ColorError::from_parse_error)?; + Ok(UserColorSpec { ty: otype, value: SpecValue::Bg(color) }) + } + } + } +} + +impl FromStr for OutType { + type Err = ColorError; + + fn from_str(s: &str) -> Result { + match &*s.to_lowercase() { + "path" => Ok(OutType::Path), + "line" => Ok(OutType::Line), + "column" => Ok(OutType::Column), + "match" => 
Ok(OutType::Match), + _ => Err(ColorError::UnrecognizedOutType(s.to_string())), + } + } +} + +impl FromStr for SpecType { + type Err = ColorError; + + fn from_str(s: &str) -> Result { + match &*s.to_lowercase() { + "fg" => Ok(SpecType::Fg), + "bg" => Ok(SpecType::Bg), + "style" => Ok(SpecType::Style), + "none" => Ok(SpecType::None), + _ => Err(ColorError::UnrecognizedSpecType(s.to_string())), + } + } +} + +impl FromStr for Style { + type Err = ColorError; + + fn from_str(s: &str) -> Result { + match &*s.to_lowercase() { + "bold" => Ok(Style::Bold), + "nobold" => Ok(Style::NoBold), + "intense" => Ok(Style::Intense), + "nointense" => Ok(Style::NoIntense), + "underline" => Ok(Style::Underline), + "nounderline" => Ok(Style::NoUnderline), + _ => Err(ColorError::UnrecognizedStyle(s.to_string())), + } + } +} diff --git a/grep-printer/src/counter.rs b/grep-printer/src/counter.rs new file mode 100644 index 000000000..c2faac837 --- /dev/null +++ b/grep-printer/src/counter.rs @@ -0,0 +1,90 @@ +use std::io::{self, Write}; + +use termcolor::{ColorSpec, WriteColor}; + +/// A writer that counts the number of bytes that have been successfully +/// written. +#[derive(Clone, Debug)] +pub struct CounterWriter { + wtr: W, + count: u64, + total_count: u64, +} + +impl CounterWriter { + pub fn new(wtr: W) -> CounterWriter { + CounterWriter { wtr: wtr, count: 0, total_count: 0 } + } +} + +impl CounterWriter { + /// Returns the total number of bytes written since construction or the + /// last time `reset` was called. + pub fn count(&self) -> u64 { + self.count + } + + /// Returns the total number of bytes written since construction. + pub fn total_count(&self) -> u64 { + self.total_count + self.count + } + + /// Resets the number of bytes written to `0`. + pub fn reset_count(&mut self) { + self.total_count += self.count; + self.count = 0; + } + + /// Clear resets all counting related state for this writer. 
+ /// +/// After this call, the total count of bytes written to the underlying + /// writer is erased and reset. + #[allow(dead_code)] + pub fn clear(&mut self) { + self.count = 0; + self.total_count = 0; + } + + #[allow(dead_code)] + pub fn get_ref(&self) -> &W { + &self.wtr + } + + pub fn get_mut(&mut self) -> &mut W { + &mut self.wtr + } + + pub fn into_inner(self) -> W { + self.wtr + } +} + +impl Write for CounterWriter { + fn write(&mut self, buf: &[u8]) -> Result { + let n = self.wtr.write(buf)?; + self.count += n as u64; + Ok(n) + } + + fn flush(&mut self) -> Result<(), io::Error> { + self.wtr.flush() + } +} + +impl WriteColor for CounterWriter { + fn supports_color(&self) -> bool { + self.wtr.supports_color() + } + + fn set_color(&mut self, spec: &ColorSpec) -> io::Result<()> { + self.wtr.set_color(spec) + } + + fn reset(&mut self) -> io::Result<()> { + self.wtr.reset() + } + + fn is_synchronous(&self) -> bool { + self.wtr.is_synchronous() + } +} diff --git a/grep-printer/src/json.rs b/grep-printer/src/json.rs new file mode 100644 index 000000000..45d6d6829 --- /dev/null +++ b/grep-printer/src/json.rs @@ -0,0 +1,921 @@ +use std::io::{self, Write}; +use std::path::Path; +use std::time::Instant; + +use grep_matcher::{Match, Matcher}; +use grep_searcher::{ + Searcher, + Sink, SinkError, SinkContext, SinkContextKind, SinkFinish, SinkMatch, +}; +use serde_json as json; + +use counter::CounterWriter; +use jsont; +use stats::Stats; + +/// The configuration for the JSON printer. +/// +/// This is manipulated by the JSONBuilder and then referenced by the actual +/// implementation. Once a printer is built, the configuration is frozen and +/// cannot be changed. +#[derive(Debug, Clone)] +struct Config { + pretty: bool, + max_matches: Option, + always_begin_end: bool, +} + +impl Default for Config { + fn default() -> Config { + Config { + pretty: false, + max_matches: None, + always_begin_end: false, + } + } +} + +/// A builder for a JSON lines printer. 
+/// +/// The builder permits configuring how the printer behaves. The JSON printer +/// has fewer configuration options than the standard printer because it is +/// a structured format, and the printer always attempts to find the most +/// information possible. +/// +/// Some configuration options, such as whether line numbers are included or +/// whether contextual lines are shown, are drawn directly from the +/// `grep_searcher::Searcher`'s configuration. +/// +/// Once a `JSON` printer is built, its configuration cannot be changed. +#[derive(Clone, Debug)] +pub struct JSONBuilder { + config: Config, +} + +impl JSONBuilder { + /// Return a new builder for configuring the JSON printer. + pub fn new() -> JSONBuilder { + JSONBuilder { config: Config::default() } + } + + /// Create a JSON printer that writes results to the given writer. + pub fn build(&self, wtr: W) -> JSON { + JSON { + config: self.config.clone(), + wtr: CounterWriter::new(wtr), + matches: vec![], + } + } + + /// Print JSON in a pretty printed format. + /// + /// Enabling this will no longer produce a "JSON lines" format, in that + /// each JSON object printed may span multiple lines. + /// + /// This is disabled by default. + pub fn pretty(&mut self, yes: bool) -> &mut JSONBuilder { + self.config.pretty = yes; + self + } + + /// Set the maximum amount of matches that are printed. + /// + /// If multi line search is enabled and a match spans multiple lines, then + /// that match is counted exactly once for the purposes of enforcing this + /// limit, regardless of how many lines it spans. + pub fn max_matches(&mut self, limit: Option) -> &mut JSONBuilder { + self.config.max_matches = limit; + self + } + + /// When enabled, the `begin` and `end` messages are always emitted, even + /// when no match is found. + /// + /// When disabled, the `begin` and `end` messages are only shown if there + /// is at least one `match` or `context` message. + /// + /// This is disabled by default. 
+ pub fn always_begin_end(&mut self, yes: bool) -> &mut JSONBuilder { + self.config.always_begin_end = yes; + self + } +} + +/// The JSON printer, which emits results in a JSON lines format. +/// +/// This type is generic over `W`, which represents any implementation of +/// the standard library `io::Write` trait. +/// +/// # Format +/// +/// This section describes the JSON format used by this printer. +/// +/// To skip the rigamarole, take a look at the +/// [example](#example) +/// at the end. +/// +/// ## Overview +/// +/// The format of this printer is the [JSON Lines](http://jsonlines.org/) +/// format. Specifically, this printer emits a sequence of messages, where +/// each message is encoded as a single JSON value on a single line. There are +/// four different types of messages (and this number may expand over time): +/// +/// * **begin** - A message that indicates a file is being searched. +/// * **end** - A message that indicates a file is done being searched. This +/// message also includes summary statistics about the search. +/// * **match** - A message that indicates a match was found. This includes +/// the text and offsets of the match. +/// * **context** - A message that indicates a contextual line was found. +/// This includes the text of the line, along with any match information if +/// the search was inverted. +/// +/// Every message is encoded in the same envelope format, which includes a tag +/// indicating the message type along with an object for the payload: +/// +/// ```json +/// { +/// "type": "{begin|end|match|context}", +/// "data": { ... } +/// } +/// ``` +/// +/// The message itself is encoded in the envelope's `data` key. +/// +/// ## Text encoding +/// +/// Before describing each message format, we first must briefly discuss text +/// encoding, since it factors into every type of message. In particular, JSON +/// may only be encoded in UTF-8, UTF-16 or UTF-32. For the purposes of this +/// printer, we need only worry about UTF-8. 
The problem here is that searching +/// is not limited to UTF-8 exclusively, which in turn implies that matches +/// may be reported that contain invalid UTF-8. Moreover, this printer may +/// also print file paths, and the encoding of file paths is itself not +/// guaranteed to be valid UTF-8. Therefore, this printer must deal with the +/// presence of invalid UTF-8 somehow. The printer could silently ignore such +/// things completely, or even lossily transcode invalid UTF-8 to valid UTF-8 +/// by replacing all invalid sequences with the Unicode replacement character. +/// However, this would prevent consumers of this format from accessing the +/// original data in a non-lossy way. +/// +/// Therefore, this printer will emit valid UTF-8 encoded bytes as normal +/// JSON strings and otherwise base64 encode data that isn't valid UTF-8. To +/// communicate whether this process occurs or not, strings are keyed by the +/// name `text` whereas arbitrary bytes are keyed by `bytes`. +/// +/// For example, when a path is included in a message, it is formatted like so, +/// if and only if the path is valid UTF-8: +/// +/// ```json +/// { +/// "path": { +/// "text": "/home/ubuntu/lib.rs" +/// } +/// } +/// ``` +/// +/// If instead our path was `/home/ubuntu/lib\xFF.rs`, where the `\xFF` byte +/// makes it invalid UTF-8, the path would instead be encoded like so: +/// +/// ```json +/// { +/// "path": { +/// "bytes": "L2hvbWUvdWJ1bnR1L2xpYv8ucnM=" +/// } +/// } +/// ``` +/// +/// This same representation is used for reporting matches as well. +/// +/// The printer guarantees that the `text` field is used whenever the +/// underlying bytes are valid UTF-8. +/// +/// ## Wire format +/// +/// This section documents the wire format emitted by this printer, starting +/// with the four types of messages. +/// +/// Each message has its own format, and is contained inside an envelope that +/// indicates the type of message. 
The envelope has these fields: +/// +/// * **type** - A string indicating the type of this message. It may be one +/// of four possible strings: `begin`, `end`, `match` or `context`. This +/// list may expand over time. +/// * **data** - The actual message data. The format of this field depends on +/// the value of `type`. The possible message formats are +/// [`begin`](#message-begin), +/// [`end`](#message-end), +/// [`match`](#message-match), +/// [`context`](#message-context). +/// +/// #### Message: **begin** +/// +/// This message indicates that a search has begun. It has these fields: +/// +/// * **path** - An +/// [arbitrary data object](#object-arbitrary-data) +/// representing the file path corresponding to the search, if one is +/// present. If no file path is available, then this field is `null`. +/// +/// #### Message: **end** +/// +/// This message indicates that a search has finished. It has these fields: +/// +/// * **path** - An +/// [arbitrary data object](#object-arbitrary-data) +/// representing the file path corresponding to the search, if one is +/// present. If no file path is available, then this field is `null`. +/// * **binary_offset** - The absolute offset in the data searched +/// corresponding to the place at which binary data was detected. If no +/// binary data was detected (or if binary detection was disabled), then this +/// field is `null`. +/// * **stats** - A [`stats` object](#object-stats) that contains summary +/// statistics for the previous search. +/// +/// #### Message: **match** +/// +/// This message indicates that a match has been found. A match generally +/// corresponds to a single line of text, although it may correspond to +/// multiple lines if the search can emit matches over multiple lines. It +/// has these fields: +/// +/// * **path** - An +/// [arbitrary data object](#object-arbitrary-data) +/// representing the file path corresponding to the search, if one is +/// present. 
If no file path is available, then this field is `null`. +/// * **lines** - An +/// [arbitrary data object](#object-arbitrary-data) +/// representing one or more lines contained in this match. +/// * **line_number** - If the searcher has been configured to report line +/// numbers, then this corresponds to the line number of the first line +/// in `lines`. If no line numbers are available, then this is `null`. +/// * **absolute_offset** - The absolute byte offset corresponding to the start +/// of `lines` in the data being searched. +/// * **submatches** - An array of [`submatch` objects](#object-submatch) +/// corresponding to matches in `lines`. The offsets included in each +/// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64 +/// encoded, then the byte offsets correspond to the data after base64 +/// decoding.) The `submatch` objects are guaranteed to be sorted by their +/// starting offsets. Note that it is possible for this array to be empty, +/// for example, when searching reports inverted matches. +/// +/// #### Message: **context** +/// +/// This message indicates that a contextual line has been found. A contextual +/// line is a line that doesn't contain a match, but is generally adjacent to +/// a line that does contain a match. The precise way in which contextual lines +/// are reported is determined by the searcher. It has these fields, which are +/// exactly the same fields found in a [`match`](#message-match): +/// +/// * **path** - An +/// [arbitrary data object](#object-arbitrary-data) +/// representing the file path corresponding to the search, if one is +/// present. If no file path is available, then this field is `null`. +/// * **lines** - An +/// [arbitrary data object](#object-arbitrary-data) +/// representing one or more lines contained in this context. This includes +/// line terminators, if they're present. 
+/// * **line_number** - If the searcher has been configured to report line +/// numbers, then this corresponds to the line number of the first line +/// in `lines`. If no line numbers are available, then this is `null`. +/// * **absolute_offset** - The absolute byte offset corresponding to the start +/// of `lines` in the data being searched. +/// * **submatches** - An array of [`submatch` objects](#object-submatch) +/// corresponding to matches in `lines`. The offsets included in each +/// `submatch` correspond to byte offsets into `lines`. (If `lines` is base64 +/// encoded, then the byte offsets correspond to the data after base64 +/// decoding.) The `submatch` objects are guaranteed to be sorted by +/// their starting offsets. Note that it is possible for this array to be +/// non-empty, for example, when searching reports inverted matches such that +/// the original matcher could match things in the contextual lines. +/// +/// #### Object: **submatch** +/// +/// This object describes submatches found within `match` or `context` +/// messages. The `start` and `end` fields indicate the half-open interval on +/// which the match occurs (`start` is included, but `end` is not). It is +/// guaranteed that `start <= end`. It has these fields: +/// +/// * **match** - An +/// [arbitrary data object](#object-arbitrary-data) +/// corresponding to the text in this submatch. +/// * **start** - A byte offset indicating the start of this match. This offset +/// is generally reported in terms of the parent object's data. For example, +/// the `lines` field in the +/// [`match`](#message-match) or [`context`](#message-context) +/// messages. +/// * **end** - A byte offset indicating the end of this match. This offset +/// is generally reported in terms of the parent object's data. For example, +/// the `lines` field in the +/// [`match`](#message-match) or [`context`](#message-context) +/// messages. 
+/// +/// #### Object: **stats** +/// +/// This object is included in messages and contains summary statistics about +/// a search. It has these fields: +/// +/// * **elapsed** - A [`duration` object](#object-duration) describing the +/// length of time that elapsed while performing the search. +/// * **searches** - The number of searches that have run. For this printer, +/// this value is always `1`. (Implementations may emit additional message +/// types that use this same `stats` object that represents summary +/// statistics over multiple searches.) +/// * **searches_with_match** - The number of searches that have run that have +/// found at least one match. This is never more than `searches`. +/// * **bytes_searched** - The total number of bytes that have been searched. +/// * **bytes_printed** - The total number of bytes that have been printed. +/// This includes everything emitted by this printer. +/// * **matched_lines** - The total number of lines that participated in a +/// match. When matches may contain multiple lines, then this includes every +/// line that is part of every match. +/// * **matches** - The total number of matches. There may be multiple matches +/// per line. When matches may contain multiple lines, each match is counted +/// only once, regardless of how many lines it spans. +/// +/// #### Object: **duration** +/// +/// This object includes a few fields for describing a duration. Two of its +/// fields, `secs` and `nanos`, can be combined to give nanosecond precision +/// on systems that support it. It has these fields: +/// +/// * **secs** - A whole number of seconds indicating the length of this +/// duration. +/// * **nanos** - A fractional part of this duration represent by nanoseconds. +/// If nanosecond precision isn't supported, then this is typically rounded +/// up to the nearest number of nanoseconds. +/// * **human** - A human readable string describing the length of the +/// duration. 
The format of the string is itself unspecified. +/// +/// #### Object: **arbitrary data** +/// +/// This object is used whenever arbitrary data needs to be represented as a +/// JSON value. This object contains two fields, where generally only one of +/// the fields is present: +/// +/// * **text** - A normal JSON string that is UTF-8 encoded. This field is +/// populated if and only if the underlying data is valid UTF-8. +/// * **bytes** - A normal JSON string that is a base64 encoding of the +/// underlying bytes. +/// +/// More information on the motivation for this representation can be seen in +/// the section [text encoding](#text-encoding) above. +/// +/// ## Example +/// +/// This section shows a small example that includes all message types. +/// +/// Here's the file we want to search, located at `/home/andrew/sherlock`: +/// +/// ```text +/// For the Doctor Watsons of this world, as opposed to the Sherlock +/// Holmeses, success in the province of detective work must always +/// be, to a very large extent, the result of luck. Sherlock Holmes +/// can extract a clew from a wisp of straw or a flake of cigar ash; +/// but Doctor Watson has to have it taken out for him and dusted, +/// and exhibited clearly, with a label attached. 
+/// ``` +/// +/// Searching for `Watson` with a `before_context` of `1` with line numbers +/// enabled shows something like this using the standard printer: +/// +/// ```text +/// sherlock:1:For the Doctor Watsons of this world, as opposed to the Sherlock +/// -- +/// sherlock-4-can extract a clew from a wisp of straw or a flake of cigar ash; +/// sherlock:5:but Doctor Watson has to have it taken out for him and dusted, +/// ``` +/// +/// Here's what the same search looks like using the JSON wire format described +/// above, wherein we show semi-prettified JSON (instead of a strict JSON +/// Lines format), for illustrative purposes: +/// +/// ```json +/// { +/// "type": "begin", +/// "data": { +/// "path": {"text": "/home/andrew/sherlock"} +/// } +/// } +/// { +/// "type": "match", +/// "data": { +/// "path": {"text": "/home/andrew/sherlock"}, +/// "lines": {"text": "For the Doctor Watsons of this world, as opposed to the Sherlock\n"}, +/// "line_number": 1, +/// "absolute_offset": 0, +/// "submatches": [ +/// {"match": {"text": "Watson"}, "start": 15, "end": 21} +/// ] +/// } +/// } +/// { +/// "type": "context", +/// "data": { +/// "path": {"text": "/home/andrew/sherlock"}, +/// "lines": {"text": "can extract a clew from a wisp of straw or a flake of cigar ash;\n"}, +/// "line_number": 4, +/// "absolute_offset": 193, +/// "submatches": [] +/// } +/// } +/// { +/// "type": "match", +/// "data": { +/// "path": {"text": "/home/andrew/sherlock"}, +/// "lines": {"text": "but Doctor Watson has to have it taken out for him and dusted,\n"}, +/// "line_number": 5, +/// "absolute_offset": 258, +/// "submatches": [ +/// {"match": {"text": "Watson"}, "start": 11, "end": 17} +/// ] +/// } +/// } +/// { +/// "type": "end", +/// "data": { +/// "path": {"text": "/home/andrew/sherlock"}, +/// "binary_offset": null, +/// "stats": { +/// "elapsed": {"secs": 0, "nanos": 36296, "human": "0.0000s"}, +/// "searches": 1, +/// "searches_with_match": 1, +/// "bytes_searched": 367, +/// 
"bytes_printed": 1151, +/// "matched_lines": 2, +/// "matches": 2 +/// } +/// } +/// } +/// ``` +#[derive(Debug)] +pub struct JSON { + config: Config, + wtr: CounterWriter, + matches: Vec, +} + +impl JSON { + /// Return a JSON lines printer with a default configuration that writes + /// matches to the given writer. + pub fn new(wtr: W) -> JSON { + JSONBuilder::new().build(wtr) + } + + /// Return an implementation of `Sink` for the JSON printer. + /// + /// This does not associate the printer with a file path, which means this + /// implementation will never print a file path along with the matches. + pub fn sink<'s, M: Matcher>( + &'s mut self, + matcher: M, + ) -> JSONSink<'static, 's, M, W> { + JSONSink { + matcher: matcher, + json: self, + path: None, + start_time: Instant::now(), + match_count: 0, + after_context_remaining: 0, + binary_byte_offset: None, + begin_printed: false, + stats: Stats::new(), + } + } + + /// Return an implementation of `Sink` associated with a file path. + /// + /// When the printer is associated with a path, then it may, depending on + /// its configuration, print the path along with the matches found. + pub fn sink_with_path<'p, 's, M, P>( + &'s mut self, + matcher: M, + path: &'p P, + ) -> JSONSink<'p, 's, M, W> + where M: Matcher, + P: ?Sized + AsRef, + { + JSONSink { + matcher: matcher, + json: self, + path: Some(path.as_ref()), + start_time: Instant::now(), + match_count: 0, + after_context_remaining: 0, + binary_byte_offset: None, + begin_printed: false, + stats: Stats::new(), + } + } + + /// Write the given message followed by a new line. The new line is + /// determined from the configuration of the given searcher. 
+ fn write_message(&mut self, message: &jsont::Message) -> io::Result<()> { + if self.config.pretty { + json::to_writer_pretty(&mut self.wtr, message)?; + } else { + json::to_writer(&mut self.wtr, message)?; + } + self.wtr.write(&[b'\n'])?; + Ok(()) + } +} + +impl JSON { + /// Returns true if and only if this printer has written at least one byte + /// to the underlying writer during any of the previous searches. + pub fn has_written(&self) -> bool { + self.wtr.total_count() > 0 + } + + /// Return a mutable reference to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.wtr.get_mut() + } + + /// Consume this printer and return back ownership of the underlying + /// writer. + pub fn into_inner(self) -> W { + self.wtr.into_inner() + } +} + +/// An implementation of `Sink` associated with a matcher and an optional file +/// path for the JSON printer. +/// +/// This type is generic over a few type parameters: +/// +/// * `'p` refers to the lifetime of the file path, if one is provided. When +/// no file path is given, then this is `'static`. +/// * `'s` refers to the lifetime of the +/// [`JSON`](struct.JSON.html) +/// printer that this type borrows. +/// * `M` refers to the type of matcher used by +/// `grep_searcher::Searcher` that is reporting results to this sink. +/// * `W` refers to the underlying writer that this printer is writing its +/// output to. +#[derive(Debug)] +pub struct JSONSink<'p, 's, M: Matcher, W: 's> { + matcher: M, + json: &'s mut JSON, + path: Option<&'p Path>, + start_time: Instant, + match_count: u64, + after_context_remaining: u64, + binary_byte_offset: Option, + begin_printed: bool, + stats: Stats, +} + +impl<'p, 's, M: Matcher, W: io::Write> JSONSink<'p, 's, M, W> { + /// Returns true if and only if this printer received a match in the + /// previous search. + /// + /// This is unaffected by the result of searches before the previous + /// search. 
+ pub fn has_match(&self) -> bool { + self.match_count > 0 + } + + /// Return the total number of matches reported to this sink. + /// + /// This corresponds to the number of times `Sink::matched` is called. + pub fn match_count(&self) -> u64 { + self.match_count + } + + /// If binary data was found in the previous search, this returns the + /// offset at which the binary data was first detected. + /// + /// The offset returned is an absolute offset relative to the entire + /// set of bytes searched. + /// + /// This is unaffected by the result of searches before the previous + /// search. e.g., If the search prior to the previous search found binary + /// data but the previous search found no binary data, then this will + /// return `None`. + pub fn binary_byte_offset(&self) -> Option { + self.binary_byte_offset + } + + /// Return a reference to the stats produced by the printer for all + /// searches executed on this sink. + pub fn stats(&self) -> &Stats { + &self.stats + } + + /// Execute the matcher over the given bytes and record the match + /// locations if the current configuration demands match granularity. + fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> { + self.json.matches.clear(); + // If printing requires knowing the location of each individual match, + // then compute and stored those right now for use later. While this + // adds an extra copy for storing the matches, we do amortize the + // allocation for it and this greatly simplifies the printing logic to + // the extent that it's easy to ensure that we never do more than + // one search to find the matches. + let matches = &mut self.json.matches; + self.matcher.find_iter(bytes, |m| { + matches.push(m); + true + }).map_err(io::Error::error_message)?; + // Don't report empty matches appearing at the end of the bytes. 
+ if !matches.is_empty() + && matches.last().unwrap().is_empty() + && matches.last().unwrap().start() >= bytes.len() + { + matches.pop().unwrap(); + } + Ok(()) + } + + /// Returns true if this printer should quit. + /// + /// This implements the logic for handling quitting after seeing a certain + /// amount of matches. In most cases, the logic is simple, but we must + /// permit all "after" contextual lines to print after reaching the limit. + fn should_quit(&self) -> bool { + let limit = match self.json.config.max_matches { + None => return false, + Some(limit) => limit, + }; + if self.match_count < limit { + return false; + } + self.after_context_remaining == 0 + } + + /// Write the "begin" message. + fn write_begin_message(&mut self) -> io::Result<()> { + if self.begin_printed { + return Ok(()); + } + let msg = jsont::Message::Begin(jsont::Begin { + path: self.path, + }); + self.json.write_message(&msg)?; + self.begin_printed = true; + Ok(()) + } +} + +impl<'p, 's, M: Matcher, W: io::Write> Sink for JSONSink<'p, 's, M, W> { + type Error = io::Error; + + fn matched( + &mut self, + searcher: &Searcher, + mat: &SinkMatch, + ) -> Result { + self.write_begin_message()?; + + self.match_count += 1; + self.after_context_remaining = searcher.after_context() as u64; + self.record_matches(mat.bytes())?; + self.stats.add_matches(self.json.matches.len() as u64); + self.stats.add_matched_lines(mat.lines().count() as u64); + + let submatches = SubMatches::new(mat.bytes(), &self.json.matches); + let msg = jsont::Message::Match(jsont::Match { + path: self.path, + lines: mat.bytes(), + line_number: mat.line_number(), + absolute_offset: mat.absolute_byte_offset(), + submatches: submatches.as_slice(), + }); + self.json.write_message(&msg)?; + Ok(!self.should_quit()) + } + + fn context( + &mut self, + searcher: &Searcher, + ctx: &SinkContext, + ) -> Result { + self.write_begin_message()?; + self.json.matches.clear(); + + if ctx.kind() == &SinkContextKind::After { + 
self.after_context_remaining = + self.after_context_remaining.saturating_sub(1); + } + let submatches = + if searcher.invert_match() { + self.record_matches(ctx.bytes())?; + SubMatches::new(ctx.bytes(), &self.json.matches) + } else { + SubMatches::empty() + }; + let msg = jsont::Message::Context(jsont::Context { + path: self.path, + lines: ctx.bytes(), + line_number: ctx.line_number(), + absolute_offset: ctx.absolute_byte_offset(), + submatches: submatches.as_slice(), + }); + self.json.write_message(&msg)?; + Ok(!self.should_quit()) + } + + fn begin( + &mut self, + _searcher: &Searcher, + ) -> Result { + self.json.wtr.reset_count(); + self.start_time = Instant::now(); + self.match_count = 0; + self.after_context_remaining = 0; + self.binary_byte_offset = None; + if self.json.config.max_matches == Some(0) { + return Ok(false); + } + + if !self.json.config.always_begin_end { + return Ok(true); + } + self.write_begin_message()?; + Ok(true) + } + + fn finish( + &mut self, + _searcher: &Searcher, + finish: &SinkFinish, + ) -> Result<(), io::Error> { + if !self.begin_printed { + return Ok(()); + } + + self.binary_byte_offset = finish.binary_byte_offset(); + self.stats.add_elapsed(self.start_time.elapsed()); + self.stats.add_searches(1); + if self.match_count > 0 { + self.stats.add_searches_with_match(1); + } + self.stats.add_bytes_searched(finish.byte_count()); + self.stats.add_bytes_printed(self.json.wtr.count()); + + let msg = jsont::Message::End(jsont::End { + path: self.path, + binary_offset: finish.binary_byte_offset(), + stats: self.stats.clone(), + }); + self.json.write_message(&msg)?; + Ok(()) + } +} + +/// SubMatches represents a set of matches in a contiguous range of bytes. +/// +/// A simpler representation for this would just simply be `Vec`, +/// but the common case is exactly one match per range of bytes, which we +/// specialize here using a fixed size array without any allocation. 
+enum SubMatches<'a> { + Empty, + Small([jsont::SubMatch<'a>; 1]), + Big(Vec>), +} + +impl<'a> SubMatches<'a> { + /// Create a new set of match ranges from a set of matches and the + /// corresponding bytes that those matches apply to. + fn new(bytes: &'a[u8], matches: &[Match]) -> SubMatches<'a> { + if matches.len() == 1 { + let mat = matches[0]; + SubMatches::Small([jsont::SubMatch { + m: &bytes[mat], + start: mat.start(), + end: mat.end(), + }]) + } else { + let mut match_ranges = vec![]; + for &mat in matches { + match_ranges.push(jsont::SubMatch { + m: &bytes[mat], + start: mat.start(), + end: mat.end(), + }); + } + SubMatches::Big(match_ranges) + } + } + + /// Create an empty set of match ranges. + fn empty() -> SubMatches<'static> { + SubMatches::Empty + } + + /// Return this set of match ranges as a slice. + fn as_slice(&self) -> &[jsont::SubMatch] { + match *self { + SubMatches::Empty => &[], + SubMatches::Small(ref x) => x, + SubMatches::Big(ref x) => x, + } + } +} + +#[cfg(test)] +mod tests { + use grep_regex::RegexMatcher; + use grep_searcher::SearcherBuilder; + + use super::{JSON, JSONBuilder}; + + const SHERLOCK: &'static [u8] = b"\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. +"; + + fn printer_contents( + printer: &mut JSON>, + ) -> String { + String::from_utf8(printer.get_mut().to_owned()).unwrap() + } + + #[test] + fn binary_detection() { + use grep_searcher::BinaryDetection; + + const BINARY: &'static [u8] = b"\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. 
Sherlock Holmes +can extract a clew \x00 from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached.\ +"; + + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = JSONBuilder::new() + .build(vec![]); + SearcherBuilder::new() + .binary_detection(BinaryDetection::quit(b'\x00')) + .heap_limit(Some(80)) + .build() + .search_reader(&matcher, BINARY, printer.sink(&matcher)) + .unwrap(); + let got = printer_contents(&mut printer); + + assert_eq!(got.lines().count(), 3); + let last = got.lines().last().unwrap(); + assert!(last.contains(r#""binary_offset":212,"#)); + } + + #[test] + fn max_matches() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = JSONBuilder::new() + .max_matches(Some(1)) + .build(vec![]); + SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) + .unwrap(); + let got = printer_contents(&mut printer); + + assert_eq!(got.lines().count(), 3); + } + + #[test] + fn no_match() { + let matcher = RegexMatcher::new( + r"DOES NOT MATCH" + ).unwrap(); + let mut printer = JSONBuilder::new() + .build(vec![]); + SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) + .unwrap(); + let got = printer_contents(&mut printer); + + assert!(got.is_empty()); + } + + #[test] + fn always_begin_end_no_match() { + let matcher = RegexMatcher::new( + r"DOES NOT MATCH" + ).unwrap(); + let mut printer = JSONBuilder::new() + .always_begin_end(true) + .build(vec![]); + SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) + .unwrap(); + let got = printer_contents(&mut printer); + + assert_eq!(got.lines().count(), 2); + assert!(got.contains("begin") && got.contains("end")); + } +} diff --git a/grep-printer/src/jsont.rs b/grep-printer/src/jsont.rs new file mode 100644 index 000000000..5028349a9 --- /dev/null +++ 
b/grep-printer/src/jsont.rs @@ -0,0 +1,213 @@ +// This module defines the types we use for JSON serialization. We specifically +// omit deserialization, partially because there isn't a clear use case for +// them at this time, but also because deserialization will complicate things. +// Namely, the types below are designed in a way that permits JSON +// serialization with little or no allocation. Allocation is often quite +// convenient for deserialization however, so these types would become a bit +// more complex. + +use std::borrow::Cow; +use std::path::Path; +use std::str; + +use base64; +use serde::{Serialize, Serializer}; + +use stats::Stats; + +#[derive(Serialize)] +#[serde(tag = "type", content = "data")] +#[serde(rename_all = "snake_case")] +pub enum Message<'a> { + Begin(Begin<'a>), + End(End<'a>), + Match(Match<'a>), + Context(Context<'a>), +} + +#[derive(Serialize)] +pub struct Begin<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, +} + +#[derive(Serialize)] +pub struct End<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, + pub binary_offset: Option, + pub stats: Stats, +} + +#[derive(Serialize)] +pub struct Match<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, + #[serde(serialize_with = "ser_bytes")] + pub lines: &'a [u8], + pub line_number: Option, + pub absolute_offset: u64, + pub submatches: &'a [SubMatch<'a>], +} + +#[derive(Serialize)] +pub struct Context<'a> { + #[serde(serialize_with = "ser_path")] + pub path: Option<&'a Path>, + #[serde(serialize_with = "ser_bytes")] + pub lines: &'a [u8], + pub line_number: Option, + pub absolute_offset: u64, + pub submatches: &'a [SubMatch<'a>], +} + +#[derive(Serialize)] +pub struct SubMatch<'a> { + #[serde(rename = "match")] + #[serde(serialize_with = "ser_bytes")] + pub m: &'a [u8], + pub start: usize, + pub end: usize, +} + +/// Data represents things that look like strings, but may actually not be +/// valid UTF-8. 
To handle this, `Data` is serialized as an object with one +/// of two keys: `text` (for valid UTF-8) or `bytes` (for invalid UTF-8). +/// +/// The happy path is valid UTF-8, which streams right through as-is, since +/// it is natively supported by JSON. When invalid UTF-8 is found, then it is +/// represented as arbitrary bytes and base64 encoded. +#[derive(Clone, Debug, Hash, PartialEq, Eq, Serialize)] +#[serde(untagged)] +enum Data<'a> { + Text { text: Cow<'a, str> }, + Bytes { + #[serde(serialize_with = "to_base64")] + bytes: &'a [u8], + }, +} + +impl<'a> Data<'a> { + fn from_bytes(bytes: &[u8]) -> Data { + match str::from_utf8(bytes) { + Ok(text) => Data::Text { text: Cow::Borrowed(text) }, + Err(_) => Data::Bytes { bytes }, + } + } + + #[cfg(unix)] + fn from_path(path: &Path) -> Data { + use std::os::unix::ffi::OsStrExt; + + match path.to_str() { + Some(text) => Data::Text { text: Cow::Borrowed(text) }, + None => Data::Bytes { bytes: path.as_os_str().as_bytes() }, + } + } + + #[cfg(not(unix))] + fn from_path(path: &Path) -> Data { + // Using lossy conversion means some paths won't round trip precisely, + // but it's not clear what we should actually do. Serde rejects + // non-UTF-8 paths, and OsStr's are serialized as a sequence of UTF-16 + // code units on Windows. Neither seem appropriate for this use case, + // so we do the easy thing for now. + Data::Text { text: path.to_string_lossy() } + } + + // Unused deserialization routines. 
+ + /* + fn into_bytes(self) -> Vec { + match self { + Data::Text { text } => text.into_bytes(), + Data::Bytes { bytes } => bytes, + } + } + + #[cfg(unix)] + fn into_path_buf(&self) -> PathBuf { + use std::os::unix::ffi::OsStrExt; + + match self { + Data::Text { text } => PathBuf::from(text), + Data::Bytes { bytes } => { + PathBuf::from(OsStr::from_bytes(bytes)) + } + } + } + + #[cfg(not(unix))] + fn into_path_buf(&self) -> PathBuf { + match self { + Data::Text { text } => PathBuf::from(text), + Data::Bytes { bytes } => { + PathBuf::from(String::from_utf8_lossy(&bytes).into_owned()) + } + } + } + */ +} + +fn to_base64( + bytes: T, + ser: S, +) -> Result +where T: AsRef<[u8]>, + S: Serializer +{ + ser.serialize_str(&base64::encode(&bytes)) +} + +fn ser_bytes( + bytes: T, + ser: S, +) -> Result +where T: AsRef<[u8]>, + S: Serializer +{ + Data::from_bytes(bytes.as_ref()).serialize(ser) +} + +fn ser_path( + path: &Option

, + ser: S, +) -> Result +where P: AsRef, + S: Serializer +{ + path.as_ref().map(|p| Data::from_path(p.as_ref())).serialize(ser) +} + +// The following are some deserialization helpers, in case we decide to support +// deserialization of the above types. + +/* +fn from_base64<'de, D>( + de: D, +) -> Result, D::Error> +where D: Deserializer<'de> +{ + let encoded = String::deserialize(de)?; + let decoded = base64::decode(encoded.as_bytes()) + .map_err(D::Error::custom)?; + Ok(decoded) +} + +fn deser_bytes<'de, D>( + de: D, +) -> Result, D::Error> +where D: Deserializer<'de> +{ + Data::deserialize(de).map(|datum| datum.into_bytes()) +} + +fn deser_path<'de, D>( + de: D, +) -> Result, D::Error> +where D: Deserializer<'de> +{ + Option::::deserialize(de) + .map(|opt| opt.map(|datum| datum.into_path_buf())) +} +*/ diff --git a/grep-printer/src/lib.rs b/grep-printer/src/lib.rs new file mode 100644 index 000000000..128b0bdfe --- /dev/null +++ b/grep-printer/src/lib.rs @@ -0,0 +1,106 @@ +/*! +This crate provides featureful and fast printers that interoperate with the +[`grep-searcher`](https://docs.rs/grep-searcher) +crate. + +# Brief overview + +The [`Standard`](struct.Standard.html) printer shows results in a human +readable format, and is modeled after the formats used by standard grep-like +tools. Features include, but are not limited to, cross platform terminal +coloring, search & replace, multi-line result handling and reporting summary +statistics. + +The [`JSON`](struct.JSON.html) printer shows results in a machine readable +format. To facilitate a stream of search results, the format uses +[JSON Lines](http://jsonlines.org/) +by emitting a series of messages as search results are found. + +The [`Summary`](struct.Summary.html) printer shows *aggregate* results for a +single search in a human readable format, and is modeled after similar formats +found in standard grep-like tools. 
This printer is useful for showing the total +number of matches and/or printing file paths that either contain or don't +contain matches. + +# Example + +This example shows how to create a "standard" printer and execute a search. + +``` +extern crate grep_regex; +extern crate grep_printer; +extern crate grep_searcher; + +use std::error::Error; + +use grep_regex::RegexMatcher; +use grep_printer::Standard; +use grep_searcher::Searcher; + +const SHERLOCK: &'static [u8] = b"\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. +"; + +# fn main() { example().unwrap(); } +fn example() -> Result<(), Box> { + let matcher = RegexMatcher::new(r"Sherlock")?; + let mut printer = Standard::new_no_color(vec![]); + Searcher::new().search_slice(&matcher, SHERLOCK, printer.sink(&matcher))?; + + // into_inner gives us back the underlying writer we provided to + // new_no_color, which is wrapped in a termcolor::NoColor. Thus, a second + // into_inner gives us back the actual buffer. + let output = String::from_utf8(printer.into_inner().into_inner())?; + let expected = "\ +1:For the Doctor Watsons of this world, as opposed to the Sherlock +3:be, to a very large extent, the result of luck. 
Sherlock Holmes +"; + assert_eq!(output, expected); + Ok(()) +} +``` +*/ + +#![deny(missing_docs)] + +#[cfg(feature = "serde1")] +extern crate base64; +extern crate grep_matcher; +#[cfg(test)] +extern crate grep_regex; +extern crate grep_searcher; +#[cfg(feature = "serde1")] +extern crate serde; +#[cfg(feature = "serde1")] +#[macro_use] +extern crate serde_derive; +#[cfg(feature = "serde1")] +extern crate serde_json; +extern crate termcolor; + +pub use color::{ColorError, ColorSpecs, UserColorSpec}; +#[cfg(feature = "serde1")] +pub use json::{JSON, JSONBuilder, JSONSink}; +pub use standard::{Standard, StandardBuilder, StandardSink}; +pub use stats::Stats; +pub use summary::{Summary, SummaryBuilder, SummaryKind, SummarySink}; +pub use util::PrinterPath; + +#[macro_use] +mod macros; + +mod color; +mod counter; +#[cfg(feature = "serde1")] +mod json; +#[cfg(feature = "serde1")] +mod jsont; +mod standard; +mod stats; +mod summary; +mod util; diff --git a/grep-printer/src/macros.rs b/grep-printer/src/macros.rs new file mode 100644 index 000000000..7b48e8c2a --- /dev/null +++ b/grep-printer/src/macros.rs @@ -0,0 +1,23 @@ +#[cfg(test)] +#[macro_export] +macro_rules! assert_eq_printed { + ($expected:expr, $got:expr) => { + let expected = &*$expected; + let got = &*$got; + if expected != got { + panic!(" +printed outputs differ! 
+ +expected: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +{} +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +got: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +{} +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +", expected, got); + } + } +} diff --git a/grep-printer/src/standard.rs b/grep-printer/src/standard.rs new file mode 100644 index 000000000..6146a8b9e --- /dev/null +++ b/grep-printer/src/standard.rs @@ -0,0 +1,3049 @@ +use std::cell::{Cell, RefCell}; +use std::cmp; +use std::io::{self, Write}; +use std::path::Path; +use std::sync::Arc; +use std::time::Instant; + +use grep_matcher::{Match, Matcher}; +use grep_searcher::{ + LineStep, Searcher, + Sink, SinkError, + SinkContext, SinkContextKind, SinkFinish, SinkMatch, +}; +use termcolor::{ColorSpec, NoColor, WriteColor}; + +use color::ColorSpecs; +use counter::CounterWriter; +use stats::Stats; +use util::{ + PrinterPath, Replacer, Sunk, + trim_ascii_prefix, trim_ascii_prefix_range, +}; + +/// The configuration for the standard printer. +/// +/// This is manipulated by the StandardBuilder and then referenced by the +/// actual implementation. Once a printer is build, the configuration is frozen +/// and cannot changed. 
+#[derive(Debug, Clone)] +struct Config { + colors: ColorSpecs, + stats: bool, + heading: bool, + path: bool, + only_matching: bool, + per_match: bool, + replacement: Arc>>, + max_columns: Option, + max_matches: Option, + column: bool, + byte_offset: bool, + trim_ascii: bool, + separator_search: Arc>>, + separator_context: Arc>>, + separator_field_match: Arc>, + separator_field_context: Arc>, + separator_path: Option, + path_terminator: Option, +} + +impl Default for Config { + fn default() -> Config { + Config { + colors: ColorSpecs::default(), + stats: false, + heading: false, + path: true, + only_matching: false, + per_match: false, + replacement: Arc::new(None), + max_columns: None, + max_matches: None, + column: false, + byte_offset: false, + trim_ascii: false, + separator_search: Arc::new(None), + separator_context: Arc::new(Some(b"--".to_vec())), + separator_field_match: Arc::new(b":".to_vec()), + separator_field_context: Arc::new(b"-".to_vec()), + separator_path: None, + path_terminator: None, + } + } +} + +/// A builder for the "standard" grep-like printer. +/// +/// The builder permits configuring how the printer behaves. Configurable +/// behavior includes, but is not limited to, limiting the number of matches, +/// tweaking separators, executing pattern replacements, recording statistics +/// and setting colors. +/// +/// Some configuration options, such as the display of line numbers or +/// contextual lines, are drawn directly from the +/// `grep_searcher::Searcher`'s configuration. +/// +/// Once a `Standard` printer is built, its configuration cannot be changed. +#[derive(Clone, Debug)] +pub struct StandardBuilder { + config: Config, +} + +impl StandardBuilder { + /// Return a new builder for configuring the standard printer. + pub fn new() -> StandardBuilder { + StandardBuilder { config: Config::default() } + } + + /// Build a printer using any implementation of `termcolor::WriteColor`. 
+ /// + /// The implementation of `WriteColor` used here controls whether colors + /// are used or not when colors have been configured using the + /// `color_specs` method. + /// + /// For maximum portability, callers should generally use either + /// `termcolor::StandardStream` or `termcolor::BufferedStandardStream` + /// where appropriate, which will automatically enable colors on Windows + /// when possible. + /// + /// However, callers may also provide an arbitrary writer using the + /// `termcolor::Ansi` or `termcolor::NoColor` wrappers, which always enable + /// colors via ANSI escapes or always disable colors, respectively. + /// + /// As a convenience, callers may use `build_no_color` to automatically + /// select the `termcolor::NoColor` wrapper to avoid needing to import + /// from `termcolor` explicitly. + pub fn build(&self, wtr: W) -> Standard { + Standard { + config: self.config.clone(), + wtr: RefCell::new(CounterWriter::new(wtr)), + matches: vec![], + } + } + + /// Build a printer from any implementation of `io::Write` and never emit + /// any colors, regardless of the user color specification settings. + /// + /// This is a convenience routine for + /// `StandardBuilder::build(termcolor::NoColor::new(wtr))`. + pub fn build_no_color( + &self, + wtr: W, + ) -> Standard> { + self.build(NoColor::new(wtr)) + } + + /// Set the user color specifications to use for coloring in this printer. + /// + /// A [`UserColorSpec`](struct.UserColorSpec.html) can be constructed from + /// a string in accordance with the color specification format. See the + /// `UserColorSpec` type documentation for more details on the format. + /// A [`ColorSpecs`](struct.ColorSpecs.html) can then be generated from + /// zero or more `UserColorSpec`s. + /// + /// Regardless of the color specifications provided here, whether color + /// is actually used or not is determined by the implementation of + /// `WriteColor` provided to `build`. 
For example, if `termcolor::NoColor` + /// is provided to `build`, then no color will ever be printed regardless + /// of the color specifications provided here. + /// + /// This completely overrides any previous color specifications. This does + /// not add to any previously provided color specifications on this + /// builder. + pub fn color_specs( + &mut self, + specs: ColorSpecs, + ) -> &mut StandardBuilder { + self.config.colors = specs; + self + } + + /// Enable the gathering of various aggregate statistics. + /// + /// When this is enabled (it's disabled by default), statistics will be + /// gathered for all uses of `Standard` printer returned by `build`, + /// including but not limited to, the total number of matches, the total + /// number of bytes searched and the total number of bytes printed. + /// + /// Aggregate statistics can be accessed via the sink's + /// [`StandardSink::stats`](struct.StandardSink.html#method.stats) + /// method. + /// + /// When this is enabled, this printer may need to do extra work in order + /// to compute certain statistics, which could cause the search to take + /// longer. + /// + /// For a complete description of available statistics, see + /// [`Stats`](struct.Stats.html). + pub fn stats(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.stats = yes; + self + } + + /// Enable the use of "headings" in the printer. + /// + /// When this is enabled, and if a file path has been given to the printer, + /// then the file path will be printed once on its own line before showing + /// any matches. If the heading is not the first thing emitted by the + /// printer, then a line terminator is printed before the heading. + /// + /// By default, this option is disabled. When disabled, the printer will + /// not show any heading and will instead print the file path (if one is + /// given) on the same line as each matching (or context) line. 
+ pub fn heading(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.heading = yes; + self + } + + /// When enabled, if a path was given to the printer, then it is shown in + /// the output (either as a heading or as a prefix to each matching line). + /// When disabled, then no paths are ever included in the output even when + /// a path is provided to the printer. + /// + /// This is enabled by default. + pub fn path(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.path = yes; + self + } + + /// Only print the specific matches instead of the entire line containing + /// each match. Each match is printed on its own line. When multi line + /// search is enabled, then matches spanning multiple lines are printed + /// such that only the matching portions of each line are shown. + pub fn only_matching(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.only_matching = yes; + self + } + + /// Print at least one line for every match. + /// + /// This is similar to the `only_matching` option, except the entire line + /// is printed for each match. This is typically useful in conjunction with + /// the `column` option, which will show the starting column number for + /// every match on every line. + /// + /// When multi-line mode is enabled, each match and its accompanying lines + /// are printed. As with single line matches, if a line contains multiple + /// matches (even if only partially), then that line is printed once for + /// each match it participates in. + pub fn per_match(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.per_match = yes; + self + } + + /// Set the bytes that will be used to replace each occurrence of a match + /// found. + /// + /// The replacement bytes given may include references to capturing groups, + /// which may either be in index form (e.g., `$2`) or can reference named + /// capturing groups if present in the original pattern (e.g., `$foo`). 
+ /// + /// For documentation on the full format, please see the `Matcher` trait's + /// `interpolate` method. + pub fn replacement( + &mut self, + replacement: Option>, + ) -> &mut StandardBuilder { + self.config.replacement = Arc::new(replacement); + self + } + + /// Set the maximum number of columns allowed for each line printed. A + /// single column is heuristically defined as a single byte. + /// + /// If a line is found which exceeds this maximum, then it is replaced + /// with a message indicating that the line has been omitted. + /// + /// The default is to not specify a limit, in which each matching or + /// contextual line is printed regardless of how long it is. + pub fn max_columns(&mut self, limit: Option) -> &mut StandardBuilder { + self.config.max_columns = limit; + self + } + + /// Set the maximum amount of matching lines that are printed. + /// + /// If multi line search is enabled and a match spans multiple lines, then + /// that match is counted exactly once for the purposes of enforcing this + /// limit, regardless of how many lines it spans. + pub fn max_matches(&mut self, limit: Option) -> &mut StandardBuilder { + self.config.max_matches = limit; + self + } + + /// Print the column number of the first match in a line. + /// + /// This option is convenient for use with `per_match` which will print a + /// line for every match along with the starting offset for that match. + /// + /// Column numbers are computed in terms of bytes from the start of the + /// line being printed. + /// + /// For matches that span multiple lines, the column number for each + /// matching line is in terms of the first matching line. + /// + /// This is disabled by default. + pub fn column(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.column = yes; + self + } + + /// Print the absolute byte offset of the beginning of each line printed. + /// + /// The absolute byte offset starts from the beginning of each search and + /// is zero based. 
+ /// + /// If the `only_matching` option is set, then this will print the absolute + /// byte offset of the beginning of each match. + pub fn byte_offset(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.byte_offset = yes; + self + } + + /// When enabled, all lines will have prefix ASCII whitespace trimmed + /// before being written. + /// + /// This is disabled by default. + pub fn trim_ascii(&mut self, yes: bool) -> &mut StandardBuilder { + self.config.trim_ascii = yes; + self + } + + /// Set the separator used between sets of search results. + /// + /// When this is set, then it will be printed on its own line immediately + /// before the results for a single search if and only if a previous search + /// had already printed results. In effect, this permits showing a divider + /// between sets of search results that does not appear at the beginning + /// or end of all search results. + /// + /// To reproduce the classic grep format, this is typically set to `--` + /// (the same as the context separator) if and only if contextual lines + /// have been requested, but disabled otherwise. + /// + /// By default, this is disabled. + pub fn separator_search( + &mut self, + sep: Option>, + ) -> &mut StandardBuilder { + self.config.separator_search = Arc::new(sep); + self + } + + /// Set the separator used between discontiguous runs of search context, + /// but only when the searcher is configured to report contextual lines. + /// + /// The separator is always printed on its own line, even if it's empty. + /// + /// If no separator is set, then nothing is printed when a context break + /// occurs. + /// + /// By default, this is set to `--`. + pub fn separator_context( + &mut self, + sep: Option>, + ) -> &mut StandardBuilder { + self.config.separator_context = Arc::new(sep); + self + } + + /// Set the separator used between fields emitted for matching lines. 
+ /// + /// For example, when the searcher has line numbers enabled, this printer + /// will print the line number before each matching line. The bytes given + /// here will be written after the line number but before the matching + /// line. + /// + /// By default, this is set to `:`. + pub fn separator_field_match( + &mut self, + sep: Vec, + ) -> &mut StandardBuilder { + self.config.separator_field_match = Arc::new(sep); + self + } + + /// Set the separator used between fields emitted for context lines. + /// + /// For example, when the searcher has line numbers enabled, this printer + /// will print the line number before each context line. The bytes given + /// here will be written after the line number but before the context + /// line. + /// + /// By default, this is set to `-`. + pub fn separator_field_context( + &mut self, + sep: Vec, + ) -> &mut StandardBuilder { + self.config.separator_field_context = Arc::new(sep); + self + } + + /// Set the path separator used when printing file paths. + /// + /// When a printer is configured with a file path, and when a match is + /// found, that file path will be printed (either as a heading or as a + /// prefix to each matching or contextual line, depending on other + /// configuration settings). Typically, printing is done by emitting the + /// file path as is. However, this setting provides the ability to use a + /// different path separator from what the current environment has + /// configured. + /// + /// A typical use for this option is to permit cygwin users on Windows to + /// set the path separator to `/` instead of using the system default of + /// `\`. + pub fn separator_path( + &mut self, + sep: Option, + ) -> &mut StandardBuilder { + self.config.separator_path = sep; + self + } + + /// Set the path terminator used. + /// + /// The path terminator is a byte that is printed after every file path + /// emitted by this printer. 
+ /// + /// If no path terminator is set (the default), then paths are terminated + /// by either new lines (for when `heading` is enabled) or the match or + /// context field separators (e.g., `:` or `-`). + pub fn path_terminator( + &mut self, + terminator: Option, + ) -> &mut StandardBuilder { + self.config.path_terminator = terminator; + self + } +} + +/// The standard printer, which implements grep-like formatting, including +/// color support. +/// +/// A default printer can be created with either of the `Standard::new` or +/// `Standard::new_no_color` constructors. However, there are a considerable +/// number of options that configure this printer's output. Those options can +/// be configured using [`StandardBuilder`](struct.StandardBuilder.html). +/// +/// This type is generic over `W`, which represents any implementation +/// of the `termcolor::WriteColor` trait. If colors are not desired, +/// then the `new_no_color` constructor can be used, or, alternatively, +/// the `termcolor::NoColor` adapter can be used to wrap any `io::Write` +/// implementation without enabling any colors. +#[derive(Debug)] +pub struct Standard { + config: Config, + wtr: RefCell>, + matches: Vec, +} + +impl Standard { + /// Return a standard printer with a default configuration that writes + /// matches to the given writer. + /// + /// The writer should be an implementation of `termcolor::WriteColor` + /// and not just a bare implementation of `io::Write`. To use a normal + /// `io::Write` implementation (simultaneously sacrificing colors), use + /// the `new_no_color` constructor. + pub fn new(wtr: W) -> Standard { + StandardBuilder::new().build(wtr) + } +} + +impl Standard> { + /// Return a standard printer with a default configuration that writes + /// matches to the given writer. + /// + /// The writer can be any implementation of `io::Write`. With this + /// constructor, the printer will never emit colors. 
+ pub fn new_no_color(wtr: W) -> Standard> { + StandardBuilder::new().build_no_color(wtr) + } +} + +impl Standard { + /// Return an implementation of `Sink` for the standard printer. + /// + /// This does not associate the printer with a file path, which means this + /// implementation will never print a file path along with the matches. + pub fn sink<'s, M: Matcher>( + &'s mut self, + matcher: M, + ) -> StandardSink<'static, 's, M, W> { + let stats = + if self.config.stats { + Some(Stats::new()) + } else { + None + }; + let needs_match_granularity = self.needs_match_granularity(); + StandardSink { + matcher: matcher, + standard: self, + replacer: Replacer::new(), + path: None, + start_time: Instant::now(), + match_count: 0, + after_context_remaining: 0, + binary_byte_offset: None, + stats: stats, + needs_match_granularity: needs_match_granularity, + } + } + + /// Return an implementation of `Sink` associated with a file path. + /// + /// When the printer is associated with a path, then it may, depending on + /// its configuration, print the path along with the matches found. + pub fn sink_with_path<'p, 's, M, P>( + &'s mut self, + matcher: M, + path: &'p P, + ) -> StandardSink<'p, 's, M, W> + where M: Matcher, + P: ?Sized + AsRef, + { + if !self.config.path { + return self.sink(matcher); + } + let stats = + if self.config.stats { + Some(Stats::new()) + } else { + None + }; + let ppath = PrinterPath::with_separator( + path.as_ref(), self.config.separator_path); + let needs_match_granularity = self.needs_match_granularity(); + StandardSink { + matcher: matcher, + standard: self, + replacer: Replacer::new(), + path: Some(ppath), + start_time: Instant::now(), + match_count: 0, + after_context_remaining: 0, + binary_byte_offset: None, + stats: stats, + needs_match_granularity: needs_match_granularity, + } + } + + /// Returns true if and only if the configuration of the printer requires + /// us to find each individual match in the lines reported by the searcher. 
+ /// + /// We care about this distinction because finding each individual match + /// costs more, so we only do it when we need to. + fn needs_match_granularity(&self) -> bool { + let supports_color = self.wtr.borrow().supports_color(); + let match_colored = !self.config.colors.matched().is_none(); + + // Coloring requires identifying each individual match. + (supports_color && match_colored) + // The column feature requires finding the position of the first match. + || self.config.column + // Requires finding each match for performing replacement. + || self.config.replacement.is_some() + // Emitting a line for each match requires finding each match. + || self.config.per_match + // Emitting only the match requires finding each match. + || self.config.only_matching + // Computing certain statistics requires finding each match. + || self.config.stats + } +} + +impl Standard { + /// Returns true if and only if this printer has written at least one byte + /// to the underlying writer during any of the previous searches. + pub fn has_written(&self) -> bool { + self.wtr.borrow().total_count() > 0 + } + + /// Return a mutable reference to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.wtr.get_mut().get_mut() + } + + /// Consume this printer and return back ownership of the underlying + /// writer. + pub fn into_inner(self) -> W { + self.wtr.into_inner().into_inner() + } +} + +/// An implementation of `Sink` associated with a matcher and an optional file +/// path for the standard printer. +/// +/// A `Sink` can be created via the +/// [`Standard::sink`](struct.Standard.html#method.sink) +/// or +/// [`Standard::sink_with_path`](struct.Standard.html#method.sink_with_path) +/// methods, depending on whether you want to include a file path in the +/// printer's output. +/// +/// Building a `StandardSink` is cheap, and callers should create a new one +/// for each thing that is searched. 
After a search has completed, callers may +/// query this sink for information such as whether a match occurred or whether +/// binary data was found (and if so, the offset at which it occurred). +/// +/// This type is generic over a few type parameters: +/// +/// * `'p` refers to the lifetime of the file path, if one is provided. When +/// no file path is given, then this is `'static`. +/// * `'s` refers to the lifetime of the +/// [`Standard`](struct.Standard.html) +/// printer that this type borrows. +/// * `M` refers to the type of matcher used by +/// `grep_searcher::Searcher` that is reporting results to this sink. +/// * `W` refers to the underlying writer that this printer is writing its +/// output to. +#[derive(Debug)] +pub struct StandardSink<'p, 's, M: Matcher, W: 's> { + matcher: M, + standard: &'s mut Standard, + replacer: Replacer, + path: Option>, + start_time: Instant, + match_count: u64, + after_context_remaining: u64, + binary_byte_offset: Option, + stats: Option, + needs_match_granularity: bool, +} + +impl<'p, 's, M: Matcher, W: WriteColor> StandardSink<'p, 's, M, W> { + /// Returns true if and only if this printer received a match in the + /// previous search. + /// + /// This is unaffected by the result of searches before the previous + /// search on this sink. + pub fn has_match(&self) -> bool { + self.match_count > 0 + } + + /// Return the total number of matches reported to this sink. + /// + /// This corresponds to the number of times `Sink::matched` is called + /// on the previous search. + /// + /// This is unaffected by the result of searches before the previous + /// search on this sink. + pub fn match_count(&self) -> u64 { + self.match_count + } + + /// If binary data was found in the previous search, this returns the + /// offset at which the binary data was first detected. + /// + /// The offset returned is an absolute offset relative to the entire + /// set of bytes searched. 
+ /// + /// This is unaffected by the result of searches before the previous + /// search. e.g., If the search prior to the previous search found binary + /// data but the previous search found no binary data, then this will + /// return `None`. + pub fn binary_byte_offset(&self) -> Option { + self.binary_byte_offset + } + + /// Return a reference to the stats produced by the printer for all + /// searches executed on this sink. + /// + /// This only returns stats if they were requested via the + /// [`StandardBuilder`](struct.StandardBuilder.html) + /// configuration. + pub fn stats(&self) -> Option<&Stats> { + self.stats.as_ref() + } + + /// Execute the matcher over the given bytes and record the match + /// locations if the current configuration demands match granularity. + fn record_matches(&mut self, bytes: &[u8]) -> io::Result<()> { + self.standard.matches.clear(); + if !self.needs_match_granularity { + return Ok(()); + } + // If printing requires knowing the location of each individual match, + // then compute and stored those right now for use later. While this + // adds an extra copy for storing the matches, we do amortize the + // allocation for it and this greatly simplifies the printing logic to + // the extent that it's easy to ensure that we never do more than + // one search to find the matches (well, for replacements, we do one + // additional search to perform the actual replacement). + let matches = &mut self.standard.matches; + self.matcher.find_iter(bytes, |m| { + matches.push(m); + true + }).map_err(io::Error::error_message)?; + // Don't report empty matches appearing at the end of the bytes. + if !matches.is_empty() + && matches.last().unwrap().is_empty() + && matches.last().unwrap().start() >= bytes.len() + { + matches.pop().unwrap(); + } + Ok(()) + } + + /// If the configuration specifies a replacement, then this executes the + /// replacement, lazily allocating memory if necessary. 
+ /// + /// To access the result of a replacement, use `replacer.replacement()`. + fn replace(&mut self, bytes: &[u8]) -> io::Result<()> { + self.replacer.clear(); + if self.standard.config.replacement.is_some() { + let replacement = (*self.standard.config.replacement) + .as_ref() + .map(|r| &*r) + .unwrap(); + self.replacer.replace_all( + &self.matcher, + bytes, + replacement, + )?; + } + Ok(()) + } + + /// Returns true if this printer should quit. + /// + /// This implements the logic for handling quitting after seeing a certain + /// amount of matches. In most cases, the logic is simple, but we must + /// permit all "after" contextual lines to print after reaching the limit. + fn should_quit(&self) -> bool { + let limit = match self.standard.config.max_matches { + None => return false, + Some(limit) => limit, + }; + if self.match_count < limit { + return false; + } + self.after_context_remaining == 0 + } +} + +impl<'p, 's, M: Matcher, W: WriteColor> Sink for StandardSink<'p, 's, M, W> { + type Error = io::Error; + + fn matched( + &mut self, + searcher: &Searcher, + mat: &SinkMatch, + ) -> Result { + self.match_count += 1; + self.after_context_remaining = searcher.after_context() as u64; + + self.record_matches(mat.bytes())?; + self.replace(mat.bytes())?; + + if let Some(ref mut stats) = self.stats { + stats.add_matches(self.standard.matches.len() as u64); + stats.add_matched_lines(mat.lines().count() as u64); + } + + StandardImpl::from_match(searcher, self, mat).sink()?; + Ok(!self.should_quit()) + } + + fn context( + &mut self, + searcher: &Searcher, + ctx: &SinkContext, + ) -> Result { + self.standard.matches.clear(); + self.replacer.clear(); + + if ctx.kind() == &SinkContextKind::After { + self.after_context_remaining = + self.after_context_remaining.saturating_sub(1); + } + if searcher.invert_match() { + self.record_matches(ctx.bytes())?; + self.replace(ctx.bytes())?; + } + StandardImpl::from_context(searcher, self, ctx).sink()?; + Ok(!self.should_quit()) + 
} + + fn context_break( + &mut self, + searcher: &Searcher, + ) -> Result { + StandardImpl::new(searcher, self).write_context_separator()?; + Ok(true) + } + + fn begin( + &mut self, + _searcher: &Searcher, + ) -> Result { + self.standard.wtr.borrow_mut().reset_count(); + self.start_time = Instant::now(); + self.match_count = 0; + self.after_context_remaining = 0; + self.binary_byte_offset = None; + if self.standard.config.max_matches == Some(0) { + return Ok(false); + } + Ok(true) + } + + fn finish( + &mut self, + _searcher: &Searcher, + finish: &SinkFinish, + ) -> Result<(), io::Error> { + self.binary_byte_offset = finish.binary_byte_offset(); + if let Some(stats) = self.stats.as_mut() { + stats.add_elapsed(self.start_time.elapsed()); + stats.add_searches(1); + if self.match_count > 0 { + stats.add_searches_with_match(1); + } + stats.add_bytes_searched(finish.byte_count()); + stats.add_bytes_printed(self.standard.wtr.borrow().count()); + } + Ok(()) + } +} + +/// The actual implementation of the standard printer. This couples together +/// the searcher, the sink implementation and information about the match. +/// +/// A StandardImpl is initialized every time a match or a contextual line is +/// reported. +#[derive(Debug)] +struct StandardImpl<'a, M: 'a + Matcher, W: 'a> { + searcher: &'a Searcher, + sink: &'a StandardSink<'a, 'a, M, W>, + sunk: Sunk<'a>, + /// Set to true if and only if we are writing a match with color. + in_color_match: Cell, +} + +impl<'a, M: Matcher, W: WriteColor> StandardImpl<'a, M, W> { + /// Bundle self with a searcher and return the core implementation of Sink. + fn new( + searcher: &'a Searcher, + sink: &'a StandardSink, + ) -> StandardImpl<'a, M, W> { + StandardImpl { + searcher: searcher, + sink: sink, + sunk: Sunk::empty(), + in_color_match: Cell::new(false), + } + } + + /// Bundle self with a searcher and return the core implementation of Sink + /// for use with handling matching lines. 
+ fn from_match( + searcher: &'a Searcher, + sink: &'a StandardSink, + mat: &'a SinkMatch<'a>, + ) -> StandardImpl<'a, M, W> { + let sunk = Sunk::from_sink_match( + mat, + &sink.standard.matches, + sink.replacer.replacement(), + ); + StandardImpl { + sunk: sunk, + ..StandardImpl::new(searcher, sink) + } + } + + /// Bundle self with a searcher and return the core implementation of Sink + /// for use with handling contextual lines. + fn from_context( + searcher: &'a Searcher, + sink: &'a StandardSink, + ctx: &'a SinkContext<'a>, + ) -> StandardImpl<'a, M, W> { + let sunk = Sunk::from_sink_context( + ctx, + &sink.standard.matches, + sink.replacer.replacement(), + ); + StandardImpl { + sunk: sunk, + ..StandardImpl::new(searcher, sink) + } + } + + fn sink(&self) -> io::Result<()> { + self.write_search_prelude()?; + if self.sunk.matches().is_empty() { + if self.multi_line() && !self.is_context() { + self.sink_fast_multi_line() + } else { + self.sink_fast() + } + } else { + if self.multi_line() && !self.is_context() { + self.sink_slow_multi_line() + } else { + self.sink_slow() + } + } + } + + /// Print matches (limited to one line) quickly by avoiding the detection + /// of each individual match in the lines reported in the given + /// `SinkMatch`. + /// + /// This should only be used when the configuration does not demand match + /// granularity and the searcher is not in multi line mode. + fn sink_fast(&self) -> io::Result<()> { + debug_assert!(self.sunk.matches().is_empty()); + debug_assert!(!self.multi_line() || self.is_context()); + + self.write_prelude( + self.sunk.absolute_byte_offset(), + self.sunk.line_number(), + None, + )?; + self.write_line(self.sunk.bytes()) + } + + /// Print matches (possibly spanning more than one line) quickly by + /// avoiding the detection of each individual match in the lines reported + /// in the given `SinkMatch`. + /// + /// This should only be used when the configuration does not demand match + /// granularity. 
This may be used when the searcher is in multi line mode. + fn sink_fast_multi_line(&self) -> io::Result<()> { + debug_assert!(self.sunk.matches().is_empty()); + // This isn't actually a required invariant for using this method, + // but if we wind up here and multi line mode is disabled, then we + // should still treat it as a bug since we should be using matched_fast + // instead. + debug_assert!(self.multi_line()); + + let line_term = self.searcher.line_terminator().as_byte(); + let mut absolute_byte_offset = self.sunk.absolute_byte_offset(); + for (i, line) in self.sunk.lines(line_term).enumerate() { + self.write_prelude( + absolute_byte_offset, + self.sunk.line_number().map(|n| n + i as u64), + None, + )?; + absolute_byte_offset += line.len() as u64; + + self.write_line(line)?; + } + Ok(()) + } + + /// Print a matching line where the configuration of the printer requires + /// finding each individual match (e.g., for coloring). + fn sink_slow(&self) -> io::Result<()> { + debug_assert!(!self.sunk.matches().is_empty()); + debug_assert!(!self.multi_line() || self.is_context()); + + if self.config().only_matching { + for &m in self.sunk.matches() { + self.write_prelude( + self.sunk.absolute_byte_offset() + m.start() as u64, + self.sunk.line_number(), + Some(m.start() as u64 + 1), + )?; + + let buf = &self.sunk.bytes()[m]; + self.write_colored_line(&[Match::new(0, buf.len())], buf)?; + } + } else if self.config().per_match { + for &m in self.sunk.matches() { + self.write_prelude( + self.sunk.absolute_byte_offset() + m.start() as u64, + self.sunk.line_number(), + Some(m.start() as u64 + 1), + )?; + self.write_colored_line(&[m], self.sunk.bytes())?; + } + } else { + self.write_prelude( + self.sunk.absolute_byte_offset(), + self.sunk.line_number(), + Some(self.sunk.matches()[0].start() as u64 + 1), + )?; + self.write_colored_line(self.sunk.matches(), self.sunk.bytes())?; + } + Ok(()) + } + + fn sink_slow_multi_line(&self) -> io::Result<()> { + 
debug_assert!(!self.sunk.matches().is_empty()); + debug_assert!(self.multi_line()); + + if self.config().only_matching { + return self.sink_slow_multi_line_only_matching(); + } else if self.config().per_match { + return self.sink_slow_multi_per_match(); + } + + let line_term = self.searcher.line_terminator().as_byte(); + let bytes = self.sunk.bytes(); + let matches = self.sunk.matches(); + let mut midx = 0; + let mut count = 0; + let mut stepper = LineStep::new(line_term, 0, bytes.len()); + while let Some((start, end)) = stepper.next(bytes) { + let mut line = Match::new(start, end); + self.write_prelude( + self.sunk.absolute_byte_offset() + line.start() as u64, + self.sunk.line_number().map(|n| n + count), + Some(matches[0].start() as u64 + 1), + )?; + count += 1; + if self.exceeds_max_columns(&bytes[line]) { + self.write_exceeded_line()?; + continue; + } + if self.has_line_terminator(&bytes[line]) { + line = line.with_end(line.end() - 1); + } + if self.config().trim_ascii { + line = self.trim_ascii_prefix_range(bytes, line); + } + + while !line.is_empty() { + if matches[midx].end() <= line.start() { + if midx + 1 < matches.len() { + midx += 1; + continue; + } else { + self.end_color_match()?; + self.write(&bytes[line])?; + break; + } + } + let m = matches[midx]; + + if line.start() < m.start() { + let upto = cmp::min(line.end(), m.start()); + self.end_color_match()?; + self.write(&bytes[line.with_end(upto)])?; + line = line.with_start(upto); + } else { + let upto = cmp::min(line.end(), m.end()); + self.start_color_match()?; + self.write(&bytes[line.with_end(upto)])?; + line = line.with_start(upto); + } + } + self.end_color_match()?; + self.write_line_term()?; + } + Ok(()) + } + + fn sink_slow_multi_line_only_matching(&self) -> io::Result<()> { + let line_term = self.searcher.line_terminator().as_byte(); + let spec = self.config().colors.matched(); + let bytes = self.sunk.bytes(); + let matches = self.sunk.matches(); + let mut midx = 0; + let mut count = 0; + let 
mut stepper = LineStep::new(line_term, 0, bytes.len()); + while let Some((start, end)) = stepper.next(bytes) { + let mut line = Match::new(start, end); + if self.has_line_terminator(&bytes[line]) { + line = line.with_end(line.end() - 1); + } + if self.config().trim_ascii { + line = self.trim_ascii_prefix_range(bytes, line); + } + while !line.is_empty() { + if matches[midx].end() <= line.start() { + if midx + 1 < matches.len() { + midx += 1; + continue; + } else { + break; + } + } + let m = matches[midx]; + + if line.start() < m.start() { + let upto = cmp::min(line.end(), m.start()); + line = line.with_start(upto); + } else { + let upto = cmp::min(line.end(), m.end()); + self.write_prelude( + self.sunk.absolute_byte_offset() + m.start() as u64, + self.sunk.line_number().map(|n| n + count), + Some(m.start() as u64 + 1), + )?; + + let buf = &bytes[line.with_end(upto)]; + line = line.with_start(upto); + if self.exceeds_max_columns(&buf) { + self.write_exceeded_line()?; + continue; + } + self.write_spec(spec, buf)?; + self.write_line_term()?; + } + } + count += 1; + } + Ok(()) + } + + fn sink_slow_multi_per_match(&self) -> io::Result<()> { + let line_term = self.searcher.line_terminator().as_byte(); + let spec = self.config().colors.matched(); + let bytes = self.sunk.bytes(); + for &m in self.sunk.matches() { + let mut m = m; + let mut count = 0; + let mut stepper = LineStep::new(line_term, 0, bytes.len()); + while let Some((start, end)) = stepper.next(bytes) { + let mut line = Match::new(start, end); + if line.start() >= m.end() { + break; + } else if line.end() <= m.start() { + count += 1; + continue; + } + self.write_prelude( + self.sunk.absolute_byte_offset() + line.start() as u64, + self.sunk.line_number().map(|n| n + count), + Some(m.start() as u64 + 1), + )?; + count += 1; + if self.exceeds_max_columns(&bytes[line]) { + self.write_exceeded_line()?; + continue; + } + if self.has_line_terminator(&bytes[line]) { + line = line.with_end(line.end() - 1); + } + if 
self.config().trim_ascii { + line = self.trim_ascii_prefix_range(bytes, line); + } + + while !line.is_empty() { + if m.end() <= line.start() { + self.write(&bytes[line])?; + line = line.with_start(line.end()); + } else if line.start() < m.start() { + let upto = cmp::min(line.end(), m.start()); + self.write(&bytes[line.with_end(upto)])?; + line = line.with_start(upto); + } else { + let upto = cmp::min(line.end(), m.end()); + self.write_spec(spec, &bytes[line.with_end(upto)])?; + line = line.with_start(upto); + } + } + self.write_line_term()?; + } + } + Ok(()) + } + + /// Write the beginning part of a matching line. This (may) include things + /// like the file path, line number among others, depending on the + /// configuration and the parameters given. + #[inline(always)] + fn write_prelude( + &self, + absolute_byte_offset: u64, + line_number: Option, + column: Option, + ) -> io::Result<()> { + let sep = self.separator_field(); + + if !self.config().heading { + self.write_path_field(sep)?; + } + if let Some(n) = line_number { + self.write_line_number(n, sep)?; + } + if let Some(n) = column { + if self.config().column { + self.write_column_number(n, sep)?; + } + } + if self.config().byte_offset { + self.write_byte_offset(absolute_byte_offset, sep)?; + } + Ok(()) + } + + #[inline(always)] + fn write_line( + &self, + line: &[u8], + ) -> io::Result<()> { + if self.exceeds_max_columns(line) { + self.write_exceeded_line()?; + } else { + self.write_trim(line)?; + if !self.has_line_terminator(line) { + self.write_line_term()?; + } + } + Ok(()) + } + + fn write_colored_line( + &self, + matches: &[Match], + line: &[u8], + ) -> io::Result<()> { + // If we know we aren't going to emit color, then we can go faster. 
+ let spec = self.config().colors.matched(); + if !self.wtr().borrow().supports_color() || spec.is_none() { + return self.write_line(line); + } + + let mut last_written = + if !self.config().trim_ascii { + 0 + } else { + self.trim_ascii_prefix_range( + line, + Match::new(0, line.len()), + ).start() + }; + for mut m in matches.iter().map(|&m| m) { + if last_written < m.start() { + self.end_color_match()?; + self.write(&line[last_written..m.start()])?; + } else if last_written < m.end() { + m = m.with_start(last_written); + } else { + continue; + } + if !m.is_empty() { + self.start_color_match()?; + self.write(&line[m])?; + } + last_written = m.end(); + } + self.end_color_match()?; + self.write(&line[last_written..])?; + if !self.has_line_terminator(line) { + self.write_line_term()?; + } + Ok(()) + } + + fn write_exceeded_line(&self) -> io::Result<()> { + if self.sunk.original_matches().is_empty() { + if self.is_context() { + self.write(b"[Omitted long context line]")?; + } else { + self.write(b"[Omitted long matching line]")?; + } + } else { + if self.config().only_matching { + if self.is_context() { + self.write(b"[Omitted long context line]")?; + } else { + self.write(b"[Omitted long matching line]")?; + } + } else { + write!( + self.wtr().borrow_mut(), + "[Omitted long line with {} matches]", + self.sunk.original_matches().len(), + )?; + } + } + self.write_line_term()?; + Ok(()) + } + + /// If this printer has a file path associated with it, then this will + /// write that path to the underlying writer followed by a line terminator. + /// (If a path terminator is set, then that is used instead of the line + /// terminator.) 
+ fn write_path_line(&self) -> io::Result<()> { + if let Some(path) = self.path() { + self.write_spec(self.config().colors.path(), path.as_bytes())?; + if let Some(term) = self.config().path_terminator { + self.write(&[term])?; + } else { + self.write_line_term()?; + } + } + Ok(()) + } + + /// If this printer has a file path associated with it, then this will + /// write that path to the underlying writer followed by the given field + /// separator. (If a path terminator is set, then that is used instead of + /// the field separator.) + fn write_path_field(&self, field_separator: &[u8]) -> io::Result<()> { + if let Some(path) = self.path() { + self.write_spec(self.config().colors.path(), path.as_bytes())?; + if let Some(term) = self.config().path_terminator { + self.write(&[term])?; + } else { + self.write(field_separator)?; + } + } + Ok(()) + } + + fn write_search_prelude(&self) -> io::Result<()> { + let this_search_written = self.wtr().borrow().count() > 0; + if this_search_written { + return Ok(()); + } + if let Some(ref sep) = *self.config().separator_search { + let ever_written = self.wtr().borrow().total_count() > 0; + if ever_written { + self.write(sep)?; + self.write_line_term()?; + } + } + if self.config().heading { + self.write_path_line()?; + } + Ok(()) + } + + fn write_context_separator(&self) -> io::Result<()> { + if let Some(ref sep) = *self.config().separator_context { + self.write(sep)?; + self.write_line_term()?; + } + Ok(()) + } + + fn write_line_number( + &self, + line_number: u64, + field_separator: &[u8], + ) -> io::Result<()> { + let n = line_number.to_string(); + self.write_spec(self.config().colors.line(), n.as_bytes())?; + self.write(field_separator)?; + Ok(()) + } + + fn write_column_number( + &self, + column_number: u64, + field_separator: &[u8], + ) -> io::Result<()> { + let n = column_number.to_string(); + self.write_spec(self.config().colors.column(), n.as_bytes())?; + self.write(field_separator)?; + Ok(()) + } + + fn 
write_byte_offset( + &self, + offset: u64, + field_separator: &[u8], + ) -> io::Result<()> { + let n = offset.to_string(); + self.write_spec(self.config().colors.column(), n.as_bytes())?; + self.write(field_separator)?; + Ok(()) + } + + fn write_line_term(&self) -> io::Result<()> { + self.write(self.searcher.line_terminator().as_bytes()) + } + + fn write_spec(&self, spec: &ColorSpec, buf: &[u8]) -> io::Result<()> { + let mut wtr = self.wtr().borrow_mut(); + wtr.set_color(spec)?; + wtr.write_all(buf)?; + wtr.reset()?; + Ok(()) + } + + fn start_color_match(&self) -> io::Result<()> { + if self.in_color_match.get() { + return Ok(()); + } + self.wtr().borrow_mut().set_color(self.config().colors.matched())?; + self.in_color_match.set(true); + Ok(()) + } + + fn end_color_match(&self) -> io::Result<()> { + if !self.in_color_match.get() { + return Ok(()); + } + self.wtr().borrow_mut().reset()?; + self.in_color_match.set(false); + Ok(()) + } + + fn write_trim(&self, buf: &[u8]) -> io::Result<()> { + if !self.config().trim_ascii { + return self.write(buf); + } + self.write(self.trim_ascii_prefix(buf)) + } + + fn write(&self, buf: &[u8]) -> io::Result<()> { + self.wtr().borrow_mut().write_all(buf) + } + + fn has_line_terminator(&self, buf: &[u8]) -> bool { + buf.last() == Some(&self.searcher.line_terminator().as_byte()) + } + + fn is_context(&self) -> bool { + self.sunk.context_kind().is_some() + } + + /// Return the underlying configuration for this printer. + fn config(&self) -> &'a Config { + &self.sink.standard.config + } + + /// Return the underlying writer that we are printing to. + fn wtr(&self) -> &'a RefCell> { + &self.sink.standard.wtr + } + + /// Return the path associated with this printer, if one exists. + fn path(&self) -> Option<&'a PrinterPath<'a>> { + self.sink.path.as_ref() + } + + /// Return the appropriate field separator based on whether we are emitting + /// matching or contextual lines. 
+ fn separator_field(&self) -> &[u8] { + if self.is_context() { + &self.config().separator_field_context + } else { + &self.config().separator_field_match + } + } + + /// Returns true if and only if the given line exceeds the maximum number + /// of columns set. If no maximum is set, then this always returns false. + fn exceeds_max_columns(&self, line: &[u8]) -> bool { + self.config().max_columns.map_or(false, |m| line.len() as u64 > m) + } + + /// Returns true if and only if the searcher may report matches over + /// multiple lines. + /// + /// Note that this doesn't just return whether the searcher is in multi + /// line mode, but also checks if the mater can match over multiple lines. + /// If it can't, then we don't need multi line handling, even if the + /// searcher has multi line mode enabled. + fn multi_line(&self) -> bool { + self.searcher.multi_line_with_matcher(&self.sink.matcher) + } + + /// Trim prefix ASCII spaces from the given slice and return the + /// corresponding range. + /// + /// This stops trimming a prefix as soon as it sees non-whitespace or a + /// line terminator. + fn trim_ascii_prefix_range(&self, slice: &[u8], range: Match) -> Match { + trim_ascii_prefix_range(self.searcher.line_terminator(), slice, range) + } + + /// Trim prefix ASCII spaces from the given slice and return the + /// corresponding sub-slice. + fn trim_ascii_prefix<'s>(&self, slice: &'s [u8]) -> &'s [u8] { + trim_ascii_prefix(self.searcher.line_terminator(), slice) + } +} + +#[cfg(test)] +mod tests { + use grep_regex::RegexMatcher; + use grep_searcher::SearcherBuilder; + use termcolor::NoColor; + + use super::{Standard, StandardBuilder}; + + const SHERLOCK: &'static str = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. 
Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached.\ +"; + + #[allow(dead_code)] + const SHERLOCK_CRLF: &'static str = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock\r +Holmeses, success in the province of detective work must always\r +be, to a very large extent, the result of luck. Sherlock Holmes\r +can extract a clew from a wisp of straw or a flake of cigar ash;\r +but Doctor Watson has to have it taken out for him and dusted,\r +and exhibited clearly, with a label attached.\ +"; + + fn printer_contents( + printer: &mut Standard>>, + ) -> String { + String::from_utf8(printer.get_mut().get_ref().to_owned()).unwrap() + } + + #[test] + fn reports_match() { + let matcher = RegexMatcher::new("Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + let mut sink = printer.sink(&matcher); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader(&matcher, SHERLOCK.as_bytes(), &mut sink) + .unwrap(); + assert!(sink.has_match()); + + let matcher = RegexMatcher::new("zzzzz").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + let mut sink = printer.sink(&matcher); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader(&matcher, SHERLOCK.as_bytes(), &mut sink) + .unwrap(); + assert!(!sink.has_match()); + } + + #[test] + fn reports_binary() { + use grep_searcher::BinaryDetection; + + let matcher = RegexMatcher::new("Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + let mut sink = printer.sink(&matcher); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader(&matcher, SHERLOCK.as_bytes(), &mut sink) + .unwrap(); + assert!(sink.binary_byte_offset().is_none()); + + let matcher = RegexMatcher::new(".+").unwrap(); + let mut printer = 
StandardBuilder::new() + .build(NoColor::new(vec![])); + let mut sink = printer.sink(&matcher); + SearcherBuilder::new() + .line_number(false) + .binary_detection(BinaryDetection::quit(b'\x00')) + .build() + .search_reader(&matcher, &b"abc\x00"[..], &mut sink) + .unwrap(); + assert_eq!(sink.binary_byte_offset(), Some(3)); + } + + #[test] + fn reports_stats() { + use std::time::Duration; + + let matcher = RegexMatcher::new("Sherlock|opposed").unwrap(); + let mut printer = StandardBuilder::new() + .stats(true) + .build(NoColor::new(vec![])); + let stats = { + let mut sink = printer.sink(&matcher); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader(&matcher, SHERLOCK.as_bytes(), &mut sink) + .unwrap(); + sink.stats().unwrap().clone() + }; + let buf = printer_contents(&mut printer); + + assert!(stats.elapsed() > Duration::default()); + assert_eq!(stats.searches(), 1); + assert_eq!(stats.searches_with_match(), 1); + assert_eq!(stats.bytes_searched(), SHERLOCK.len() as u64); + assert_eq!(stats.bytes_printed(), buf.len() as u64); + assert_eq!(stats.matched_lines(), 2); + assert_eq!(stats.matches(), 3); + + } + + #[test] + fn reports_stats_multiple() { + use std::time::Duration; + + let matcher = RegexMatcher::new("Sherlock|opposed").unwrap(); + let mut printer = StandardBuilder::new() + .stats(true) + .build(NoColor::new(vec![])); + let stats = { + let mut sink = printer.sink(&matcher); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader(&matcher, SHERLOCK.as_bytes(), &mut sink) + .unwrap(); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader(&matcher, &b"zzzzzzzzzz"[..], &mut sink) + .unwrap(); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader(&matcher, SHERLOCK.as_bytes(), &mut sink) + .unwrap(); + sink.stats().unwrap().clone() + }; + let buf = printer_contents(&mut printer); + + assert!(stats.elapsed() > Duration::default()); + assert_eq!(stats.searches(), 3); + 
assert_eq!(stats.searches_with_match(), 2); + assert_eq!(stats.bytes_searched(), 10 + 2 * SHERLOCK.len() as u64); + assert_eq!(stats.bytes_printed(), buf.len() as u64); + assert_eq!(stats.matched_lines(), 4); + assert_eq!(stats.matches(), 6); + } + + #[test] + fn context_break() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .separator_context(Some(b"--abc--".to_vec())) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +--abc-- +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn context_break_multiple_no_heading() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .separator_search(Some(b"--xyz--".to_vec())) + .separator_context(Some(b"--abc--".to_vec())) + .build(NoColor::new(vec![])); + + SearcherBuilder::new() + .line_number(false) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + SearcherBuilder::new() + .line_number(false) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +--abc-- +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. +--xyz-- +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +--abc-- +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn context_break_multiple_heading() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .heading(true) + .separator_search(Some(b"--xyz--".to_vec())) + .separator_context(Some(b"--abc--".to_vec())) + .build(NoColor::new(vec![])); + + SearcherBuilder::new() + .line_number(false) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + SearcherBuilder::new() + .line_number(false) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +--abc-- +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. +--xyz-- +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +--abc-- +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn path() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .path(false) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:For the Doctor Watsons of this world, as opposed to the Sherlock +5:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn separator_field() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .separator_field_match(b"!!".to_vec()) + .separator_field_context(b"^^".to_vec()) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +sherlock!!For the Doctor Watsons of this world, as opposed to the Sherlock +sherlock^^Holmeses, success in the province of detective work must always +-- +sherlock^^can extract a clew from a wisp of straw or a flake of cigar ash; +sherlock!!but Doctor Watson has to have it taken out for him and dusted, +sherlock^^and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn separator_path() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .separator_path(Some(b'Z')) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "books/sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +booksZsherlock:For the Doctor Watsons of this world, as opposed to the Sherlock +booksZsherlock:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn path_terminator() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .path_terminator(Some(b'Z')) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "books/sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +books/sherlockZFor the Doctor Watsons of this world, as opposed to the Sherlock +books/sherlockZbut Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn heading() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .heading(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +sherlock +For the Doctor Watsons of this world, as opposed to the Sherlock +but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn no_heading() { + let matcher = 
RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .heading(false) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock +sherlock:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn no_heading_multiple() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .heading(false) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let matcher = RegexMatcher::new("Sherlock").unwrap(); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock +sherlock:but Doctor Watson has to have it taken out for him and dusted, +sherlock:For the Doctor Watsons of this world, as opposed to the Sherlock +sherlock:be, to a very large extent, the result of luck. 
Sherlock Holmes +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn heading_multiple() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .heading(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let matcher = RegexMatcher::new("Sherlock").unwrap(); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +sherlock +For the Doctor Watsons of this world, as opposed to the Sherlock +but Doctor Watson has to have it taken out for him and dusted, +sherlock +For the Doctor Watsons of this world, as opposed to the Sherlock +be, to a very large extent, the result of luck. Sherlock Holmes +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn trim_ascii() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .trim_ascii(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + " Watson".as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +Watson +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn trim_ascii_multi_line() { + let matcher = RegexMatcher::new("(?s:.{0})Watson").unwrap(); + let mut printer = StandardBuilder::new() + .trim_ascii(true) + .stats(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .multi_line(true) + .build() + .search_reader( + &matcher, + " Watson".as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +Watson +"; + 
assert_eq_printed!(expected, got); + } + + #[test] + fn trim_ascii_with_line_term() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .trim_ascii(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .before_context(1) + .build() + .search_reader( + &matcher, + "\n Watson".as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1- +2:Watson +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn line_number() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:For the Doctor Watsons of this world, as opposed to the Sherlock +5:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn line_number_multi_line() { + let matcher = RegexMatcher::new("(?s)Watson.+Watson").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .multi_line(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:For the Doctor Watsons of this world, as opposed to the Sherlock +2:Holmeses, success in the province of detective work must always +3:be, to a very large extent, the result of luck. 
Sherlock Holmes +4:can extract a clew from a wisp of straw or a flake of cigar ash; +5:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn column_number() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +16:For the Doctor Watsons of this world, as opposed to the Sherlock +12:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn column_number_multi_line() { + let matcher = RegexMatcher::new("(?s)Watson.+Watson").unwrap(); + let mut printer = StandardBuilder::new() + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .multi_line(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +16:For the Doctor Watsons of this world, as opposed to the Sherlock +16:Holmeses, success in the province of detective work must always +16:be, to a very large extent, the result of luck. 
Sherlock Holmes +16:can extract a clew from a wisp of straw or a flake of cigar ash; +16:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn byte_offset() { + let matcher = RegexMatcher::new("Watson").unwrap(); + let mut printer = StandardBuilder::new() + .byte_offset(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +258:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn byte_offset_multi_line() { + let matcher = RegexMatcher::new("(?s)Watson.+Watson").unwrap(); + let mut printer = StandardBuilder::new() + .byte_offset(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .multi_line(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. 
Sherlock Holmes +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn max_columns() { + let matcher = RegexMatcher::new("ash|dusted").unwrap(); + let mut printer = StandardBuilder::new() + .max_columns(Some(63)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +[Omitted long matching line] +but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn max_columns_with_count() { + let matcher = RegexMatcher::new("cigar|ash|dusted").unwrap(); + let mut printer = StandardBuilder::new() + .stats(true) + .max_columns(Some(63)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +[Omitted long line with 2 matches] +but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn max_columns_multi_line() { + let matcher = RegexMatcher::new("(?s)ash.+dusted").unwrap(); + let mut printer = StandardBuilder::new() + .max_columns(Some(63)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .multi_line(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +[Omitted long matching line] +but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn max_matches() { + let matcher = 
RegexMatcher::new("Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .max_matches(Some(1)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn max_matches_context() { + // after context: 1 + let matcher = RegexMatcher::new("Doctor Watsons").unwrap(); + let mut printer = StandardBuilder::new() + .max_matches(Some(1)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +"; + assert_eq_printed!(expected, got); + + // after context: 4 + let mut printer = StandardBuilder::new() + .max_matches(Some(1)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .after_context(4) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. 
Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + + // after context: 1, max matches: 2 + let matcher = RegexMatcher::new("Doctor Watsons|but Doctor").unwrap(); + let mut printer = StandardBuilder::new() + .max_matches(Some(2)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +-- +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. +"; + assert_eq_printed!(expected, got); + + // after context: 4, max matches: 2 + let mut printer = StandardBuilder::new() + .max_matches(Some(2)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .after_context(4) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn max_matches_multi_line1() { + let matcher = RegexMatcher::new("(?s:.{0})Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .max_matches(Some(1)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .multi_line(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn max_matches_multi_line2() { + let matcher = RegexMatcher::new( + r"(?s)Watson.+?(Holmeses|clearly)" + ).unwrap(); + let mut printer = StandardBuilder::new() + .max_matches(Some(1)) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(false) + .multi_line(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn only_matching() { + let matcher = RegexMatcher::new("Doctor Watsons|Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:9:Doctor Watsons +1:57:Sherlock +3:49:Sherlock +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn only_matching_multi_line1() { + let matcher = RegexMatcher::new( + r"(?s:.{0})(Doctor Watsons|Sherlock)" + ).unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + 
.column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:9:Doctor Watsons +1:57:Sherlock +3:49:Sherlock +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn only_matching_multi_line2() { + let matcher = RegexMatcher::new( + r"(?s)Watson.+?(Holmeses|clearly)" + ).unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:16:Watsons of this world, as opposed to the Sherlock +2:16:Holmeses +5:12:Watson has to have it taken out for him and dusted, +6:12:and exhibited clearly +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn only_matching_max_columns() { + let matcher = RegexMatcher::new("Doctor Watsons|Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .max_columns(Some(10)) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:9:[Omitted long matching line] +1:57:Sherlock +3:49:Sherlock +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn only_matching_max_columns_multi_line1() { + let matcher = RegexMatcher::new( + r"(?s:.{0})(Doctor Watsons|Sherlock)" + ).unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .max_columns(Some(10)) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + 
.line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:9:[Omitted long matching line] +1:57:Sherlock +3:49:Sherlock +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn only_matching_max_columns_multi_line2() { + let matcher = RegexMatcher::new( + r"(?s)Watson.+?(Holmeses|clearly)" + ).unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .max_columns(Some(50)) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:16:Watsons of this world, as opposed to the Sherlock +2:16:Holmeses +5:12:[Omitted long matching line] +6:12:and exhibited clearly +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn per_match() { + let matcher = RegexMatcher::new("Doctor Watsons|Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .per_match(true) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:9:For the Doctor Watsons of this world, as opposed to the Sherlock +1:57:For the Doctor Watsons of this world, as opposed to the Sherlock +3:49:be, to a very large extent, the result of luck. 
Sherlock Holmes +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn per_match_multi_line1() { + let matcher = RegexMatcher::new( + r"(?s:.{0})(Doctor Watsons|Sherlock)" + ).unwrap(); + let mut printer = StandardBuilder::new() + .per_match(true) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:9:For the Doctor Watsons of this world, as opposed to the Sherlock +1:57:For the Doctor Watsons of this world, as opposed to the Sherlock +3:49:be, to a very large extent, the result of luck. Sherlock Holmes +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn per_match_multi_line2() { + let matcher = RegexMatcher::new( + r"(?s)Watson.+?(Holmeses|clearly)", + ).unwrap(); + let mut printer = StandardBuilder::new() + .per_match(true) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:16:For the Doctor Watsons of this world, as opposed to the Sherlock +2:16:Holmeses, success in the province of detective work must always +5:12:but Doctor Watson has to have it taken out for him and dusted, +6:12:and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn per_match_multi_line3() { + let matcher = RegexMatcher::new( + r"(?s)Watson.+?Holmeses|always.+?be", + ).unwrap(); + let mut printer = StandardBuilder::new() + .per_match(true) + .column(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:16:For the Doctor Watsons of this world, as opposed to the Sherlock +2:16:Holmeses, success in the province of detective work must always +2:123:Holmeses, success in the province of detective work must always +3:123:be, to a very large extent, the result of luck. Sherlock Holmes +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn replacement_passthru() { + let matcher = RegexMatcher::new(r"Sherlock|Doctor (\w+)").unwrap(); + let mut printer = StandardBuilder::new() + .replacement(Some(b"doctah $1 MD".to_vec())) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .passthru(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:For the doctah Watsons MD of this world, as opposed to the doctah MD +2-Holmeses, success in the province of detective work must always +3:be, to a very large extent, the result of luck. doctah MD Holmes +4-can extract a clew from a wisp of straw or a flake of cigar ash; +5:but doctah Watson MD has to have it taken out for him and dusted, +6-and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn replacement() { + let matcher = RegexMatcher::new(r"Sherlock|Doctor (\w+)").unwrap(); + let mut printer = StandardBuilder::new() + .replacement(Some(b"doctah $1 MD".to_vec())) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:For the doctah Watsons MD of this world, as opposed to the doctah MD +3:be, to a very large extent, the result of luck. doctah MD Holmes +5:but doctah Watson MD has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn replacement_max_columns() { + let matcher = RegexMatcher::new(r"Sherlock|Doctor (\w+)").unwrap(); + let mut printer = StandardBuilder::new() + .max_columns(Some(67)) + .replacement(Some(b"doctah $1 MD".to_vec())) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:[Omitted long line with 2 matches] +3:be, to a very large extent, the result of luck. 
doctah MD Holmes +5:but doctah Watson MD has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn replacement_only_matching() { + let matcher = RegexMatcher::new(r"Sherlock|Doctor (\w+)").unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .replacement(Some(b"doctah $1 MD".to_vec())) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:doctah Watsons MD +1:doctah MD +3:doctah MD +5:doctah Watson MD +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn replacement_per_match() { + let matcher = RegexMatcher::new(r"Sherlock|Doctor (\w+)").unwrap(); + let mut printer = StandardBuilder::new() + .per_match(true) + .replacement(Some(b"doctah $1 MD".to_vec())) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1:For the doctah Watsons MD of this world, as opposed to the doctah MD +1:For the doctah Watsons MD of this world, as opposed to the doctah MD +3:be, to a very large extent, the result of luck. 
doctah MD Holmes +5:but doctah Watson MD has to have it taken out for him and dusted, +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn invert() { + let matcher = RegexMatcher::new(r"Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .invert_match(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +2:Holmeses, success in the province of detective work must always +4:can extract a clew from a wisp of straw or a flake of cigar ash; +5:but Doctor Watson has to have it taken out for him and dusted, +6:and exhibited clearly, with a label attached. +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn invert_multi_line() { + let matcher = RegexMatcher::new(r"(?s:.{0})Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .invert_match(true) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +2:Holmeses, success in the province of detective work must always +4:can extract a clew from a wisp of straw or a flake of cigar ash; +5:but Doctor Watson has to have it taken out for him and dusted, +6:and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn invert_context() { + let matcher = RegexMatcher::new(r"Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .invert_match(true) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1-For the Doctor Watsons of this world, as opposed to the Sherlock +2:Holmeses, success in the province of detective work must always +3-be, to a very large extent, the result of luck. Sherlock Holmes +4:can extract a clew from a wisp of straw or a flake of cigar ash; +5:but Doctor Watson has to have it taken out for him and dusted, +6:and exhibited clearly, with a label attached. +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn invert_context_multi_line() { + let matcher = RegexMatcher::new(r"(?s:.{0})Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .invert_match(true) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1-For the Doctor Watsons of this world, as opposed to the Sherlock +2:Holmeses, success in the province of detective work must always +3-be, to a very large extent, the result of luck. Sherlock Holmes +4:can extract a clew from a wisp of straw or a flake of cigar ash; +5:but Doctor Watson has to have it taken out for him and dusted, +6:and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } + + #[test] + fn invert_context_only_matching() { + let matcher = RegexMatcher::new(r"Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .line_number(true) + .invert_match(true) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1-Sherlock +2:Holmeses, success in the province of detective work must always +3-Sherlock +4:can extract a clew from a wisp of straw or a flake of cigar ash; +5:but Doctor Watson has to have it taken out for him and dusted, +6:and exhibited clearly, with a label attached. +"; + assert_eq_printed!(expected, got); + } + + #[test] + fn invert_context_only_matching_multi_line() { + let matcher = RegexMatcher::new(r"(?s:.{0})Sherlock").unwrap(); + let mut printer = StandardBuilder::new() + .only_matching(true) + .build(NoColor::new(vec![])); + SearcherBuilder::new() + .multi_line(true) + .line_number(true) + .invert_match(true) + .before_context(1) + .after_context(1) + .build() + .search_reader( + &matcher, + SHERLOCK.as_bytes(), + printer.sink(&matcher), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + let expected = "\ +1-Sherlock +2:Holmeses, success in the province of detective work must always +3-Sherlock +4:can extract a clew from a wisp of straw or a flake of cigar ash; +5:but Doctor Watson has to have it taken out for him and dusted, +6:and exhibited clearly, with a label attached. 
+"; + assert_eq_printed!(expected, got); + } +} diff --git a/grep-printer/src/stats.rs b/grep-printer/src/stats.rs new file mode 100644 index 000000000..a62aead97 --- /dev/null +++ b/grep-printer/src/stats.rs @@ -0,0 +1,147 @@ +use std::ops::{Add, AddAssign}; +use std::time::Duration; + +use util::NiceDuration; + +/// Summary statistics produced at the end of a search. +/// +/// When statistics are reported by a printer, they correspond to all searches +/// executed with that printer. +#[derive(Clone, Debug, Default, PartialEq, Eq)] +#[cfg_attr(feature = "serde1", derive(Serialize))] +pub struct Stats { + elapsed: NiceDuration, + searches: u64, + searches_with_match: u64, + bytes_searched: u64, + bytes_printed: u64, + matched_lines: u64, + matches: u64, +} + +impl Add for Stats { + type Output = Stats; + + fn add(self, rhs: Stats) -> Stats { + self + &rhs + } +} + +impl<'a> Add<&'a Stats> for Stats { + type Output = Stats; + + fn add(self, rhs: &'a Stats) -> Stats { + Stats { + elapsed: NiceDuration(self.elapsed.0 + rhs.elapsed.0), + searches: self.searches + rhs.searches, + searches_with_match: + self.searches_with_match + rhs.searches_with_match, + bytes_searched: self.bytes_searched + rhs.bytes_searched, + bytes_printed: self.bytes_printed + rhs.bytes_printed, + matched_lines: self.matched_lines + rhs.matched_lines, + matches: self.matches + rhs.matches, + } + } +} + +impl AddAssign for Stats { + fn add_assign(&mut self, rhs: Stats) { + *self += &rhs; + } +} + +impl<'a> AddAssign<&'a Stats> for Stats { + fn add_assign(&mut self, rhs: &'a Stats) { + self.elapsed.0 += rhs.elapsed.0; + self.searches += rhs.searches; + self.searches_with_match += rhs.searches_with_match; + self.bytes_searched += rhs.bytes_searched; + self.bytes_printed += rhs.bytes_printed; + self.matched_lines += rhs.matched_lines; + self.matches += rhs.matches; + } +} + +impl Stats { + /// Return a new value for tracking aggregate statistics across searches. 
+ /// + /// All statistics are set to `0`. + pub fn new() -> Stats { + Stats::default() + } + + /// Return the total amount of time elapsed. + pub fn elapsed(&self) -> Duration { + self.elapsed.0 + } + + /// Return the total number of searches executed. + pub fn searches(&self) -> u64 { + self.searches + } + + /// Return the total number of searches that found at least one match. + pub fn searches_with_match(&self) -> u64 { + self.searches_with_match + } + + /// Return the total number of bytes searched. + pub fn bytes_searched(&self) -> u64 { + self.bytes_searched + } + + /// Return the total number of bytes printed. + pub fn bytes_printed(&self) -> u64 { + self.bytes_printed + } + + /// Return the total number of lines that participated in a match. + /// + /// When matches may contain multiple lines then this includes every line + /// that is part of every match. + pub fn matched_lines(&self) -> u64 { + self.matched_lines + } + + /// Return the total number of matches. + /// + /// There may be multiple matches per line. + pub fn matches(&self) -> u64 { + self.matches + } + + /// Add to the elapsed time. + pub fn add_elapsed(&mut self, duration: Duration) { + self.elapsed.0 += duration; + } + + /// Add to the number of searches executed. + pub fn add_searches(&mut self, n: u64) { + self.searches += n; + } + + /// Add to the number of searches that found at least one match. + pub fn add_searches_with_match(&mut self, n: u64) { + self.searches_with_match += n; + } + + /// Add to the total number of bytes searched. + pub fn add_bytes_searched(&mut self, n: u64) { + self.bytes_searched += n; + } + + /// Add to the total number of bytes printed. + pub fn add_bytes_printed(&mut self, n: u64) { + self.bytes_printed += n; + } + + /// Add to the total number of lines that participated in a match. + pub fn add_matched_lines(&mut self, n: u64) { + self.matched_lines += n; + } + + /// Add to the total number of matches. 
+    pub fn add_matches(&mut self, n: u64) {
+        self.matches += n;
+    }
+}
diff --git a/grep-printer/src/summary.rs b/grep-printer/src/summary.rs
new file mode 100644
index 000000000..a63dbd3f5
--- /dev/null
+++ b/grep-printer/src/summary.rs
@@ -0,0 +1,1068 @@
+use std::cell::RefCell;
+use std::io::{self, Write};
+use std::path::Path;
+use std::sync::Arc;
+use std::time::Instant;
+
+use grep_matcher::Matcher;
+use grep_searcher::{Searcher, Sink, SinkError, SinkFinish, SinkMatch};
+use termcolor::{ColorSpec, NoColor, WriteColor};
+
+use color::ColorSpecs;
+use counter::CounterWriter;
+use stats::Stats;
+use util::PrinterPath;
+
+/// The configuration for the summary printer.
+///
+/// This is manipulated by the SummaryBuilder and then referenced by the actual
+/// implementation. Once a printer is built, the configuration is frozen and
+/// cannot be changed.
+#[derive(Debug, Clone)]
+struct Config {
+    kind: SummaryKind,
+    colors: ColorSpecs,
+    stats: bool,
+    path: bool,
+    max_matches: Option<u64>,
+    exclude_zero: bool,
+    separator_field: Arc<Vec<u8>>,
+    separator_path: Option<u8>,
+    path_terminator: Option<u8>,
+}
+
+impl Default for Config {
+    fn default() -> Config {
+        Config {
+            kind: SummaryKind::Count,
+            colors: ColorSpecs::default(),
+            stats: false,
+            path: true,
+            max_matches: None,
+            exclude_zero: true,
+            separator_field: Arc::new(b":".to_vec()),
+            separator_path: None,
+            path_terminator: None,
+        }
+    }
+}
+
+/// The type of summary output (if any) to print.
+#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+pub enum SummaryKind {
+    /// Show only a count of the total number of matches (counting each line
+    /// at most once) found.
+    ///
+    /// If the `path` setting is enabled, then the count is prefixed by the
+    /// corresponding file path.
+    Count,
+    /// Show only a count of the total number of matches (counting possibly
+    /// many matches on each line) found.
+    ///
+    /// If the `path` setting is enabled, then the count is prefixed by the
+    /// corresponding file path.
+    CountMatches,
+    /// Show only the file path if and only if a match was found.
+    ///
+    /// This ignores the `path` setting and always shows the file path. If no
+    /// file path is provided, then searching will immediately stop and return
+    /// an error.
+    PathWithMatch,
+    /// Show only the file path if and only if a match was found.
+    ///
+    /// This ignores the `path` setting and always shows the file path. If no
+    /// file path is provided, then searching will immediately stop and return
+    /// an error.
+    PathWithoutMatch,
+    /// Don't show any output and stop the search once a match is found.
+    ///
+    /// Note that if `stats` is enabled, then searching continues in order to
+    /// compute statistics.
+    Quiet,
+}
+
+impl SummaryKind {
+    /// Returns true if and only if this output mode requires a file path.
+    ///
+    /// When an output mode requires a file path, then the summary printer
+    /// will report an error at the start of every search that lacks a file
+    /// path.
+    fn requires_path(&self) -> bool {
+        use self::SummaryKind::*;
+
+        match *self {
+            PathWithMatch | PathWithoutMatch => true,
+            Count | CountMatches | Quiet => false,
+        }
+    }
+
+    /// Returns true if and only if this output mode requires computing
+    /// statistics, regardless of whether they have been enabled or not.
+    fn requires_stats(&self) -> bool {
+        use self::SummaryKind::*;
+
+        match *self {
+            CountMatches => true,
+            Count | PathWithMatch | PathWithoutMatch | Quiet => false,
+        }
+    }
+
+    /// Returns true if and only if a printer using this output mode can
+    /// quit after seeing the first match.
+    fn quit_early(&self) -> bool {
+        use self::SummaryKind::*;
+
+        match *self {
+            PathWithMatch | Quiet => true,
+            Count | CountMatches | PathWithoutMatch => false,
+        }
+    }
+}
+
+/// A builder for the summary printer.
+///
+/// The builder permits configuring how the printer behaves.
The summary +/// printer has fewer configuration options than the standard printer because +/// it aims to produce aggregate output about a single search (typically just +/// one line) instead of output for each match. +/// +/// Once a `Summary` printer is built, its configuration cannot be changed. +#[derive(Clone, Debug)] +pub struct SummaryBuilder { + config: Config, +} + +impl SummaryBuilder { + /// Return a new builder for configuring the summary printer. + pub fn new() -> SummaryBuilder { + SummaryBuilder { config: Config::default() } + } + + /// Build a printer using any implementation of `termcolor::WriteColor`. + /// + /// The implementation of `WriteColor` used here controls whether colors + /// are used or not when colors have been configured using the + /// `color_specs` method. + /// + /// For maximum portability, callers should generally use either + /// `termcolor::StandardStream` or `termcolor::BufferedStandardStream` + /// where appropriate, which will automatically enable colors on Windows + /// when possible. + /// + /// However, callers may also provide an arbitrary writer using the + /// `termcolor::Ansi` or `termcolor::NoColor` wrappers, which always enable + /// colors via ANSI escapes or always disable colors, respectively. + /// + /// As a convenience, callers may use `build_no_color` to automatically + /// select the `termcolor::NoColor` wrapper to avoid needing to import + /// from `termcolor` explicitly. + pub fn build(&self, wtr: W) -> Summary { + Summary { + config: self.config.clone(), + wtr: RefCell::new(CounterWriter::new(wtr)), + } + } + + /// Build a printer from any implementation of `io::Write` and never emit + /// any colors, regardless of the user color specification settings. + /// + /// This is a convenience routine for + /// `SummaryBuilder::build(termcolor::NoColor::new(wtr))`. + pub fn build_no_color( + &self, + wtr: W, + ) -> Summary> { + self.build(NoColor::new(wtr)) + } + + /// Set the output mode for this printer. 
+ /// + /// The output mode controls how aggregate results of a search are printed. + /// + /// By default, this printer uses the `Count` mode. + pub fn kind(&mut self, kind: SummaryKind) -> &mut SummaryBuilder { + self.config.kind = kind; + self + } + + /// Set the user color specifications to use for coloring in this printer. + /// + /// A [`UserColorSpec`](struct.UserColorSpec.html) can be constructed from + /// a string in accordance with the color specification format. See the + /// `UserColorSpec` type documentation for more details on the format. + /// A [`ColorSpecs`](struct.ColorSpecs.html) can then be generated from + /// zero or more `UserColorSpec`s. + /// + /// Regardless of the color specifications provided here, whether color + /// is actually used or not is determined by the implementation of + /// `WriteColor` provided to `build`. For example, if `termcolor::NoColor` + /// is provided to `build`, then no color will ever be printed regardless + /// of the color specifications provided here. + /// + /// This completely overrides any previous color specifications. This does + /// not add to any previously provided color specifications on this + /// builder. + pub fn color_specs( + &mut self, + specs: ColorSpecs, + ) -> &mut SummaryBuilder { + self.config.colors = specs; + self + } + + /// Enable the gathering of various aggregate statistics. + /// + /// When this is enabled (it's disabled by default), statistics will be + /// gathered for all uses of `Summary` printer returned by `build`, + /// including but not limited to, the total number of matches, the total + /// number of bytes searched and the total number of bytes printed. + /// + /// Aggregate statistics can be accessed via the sink's + /// [`SummarySink::stats`](struct.SummarySink.html#method.stats) + /// method. + /// + /// When this is enabled, this printer may need to do extra work in order + /// to compute certain statistics, which could cause the search to take + /// longer. 
For example, in `Quiet` mode, a search can quit after finding + /// the first match, but if `stats` is enabled, then the search will + /// continue after the first match in order to compute statistics. + /// + /// For a complete description of available statistics, see + /// [`Stats`](struct.Stats.html). + /// + /// Note that some output modes, such as `CountMatches`, automatically + /// enable this option even if it has been explicitly disabled. + pub fn stats(&mut self, yes: bool) -> &mut SummaryBuilder { + self.config.stats = yes; + self + } + + /// When enabled, if a path was given to the printer, then it is shown in + /// the output (either as a heading or as a prefix to each matching line). + /// When disabled, then no paths are ever included in the output even when + /// a path is provided to the printer. + /// + /// This setting has no effect in `PathWithMatch` and `PathWithoutMatch` + /// modes. + /// + /// This is enabled by default. + pub fn path(&mut self, yes: bool) -> &mut SummaryBuilder { + self.config.path = yes; + self + } + + /// Set the maximum amount of matches that are printed. + /// + /// If multi line search is enabled and a match spans multiple lines, then + /// that match is counted exactly once for the purposes of enforcing this + /// limit, regardless of how many lines it spans. + pub fn max_matches(&mut self, limit: Option) -> &mut SummaryBuilder { + self.config.max_matches = limit; + self + } + + /// Exclude count-related summary results with no matches. + /// + /// When enabled and the mode is either `Count` or `CountMatches`, then + /// results are not printed if no matches were found. Otherwise, every + /// search prints a result with a possibly `0` number of matches. + pub fn exclude_zero(&mut self, yes: bool) -> &mut SummaryBuilder { + self.config.exclude_zero = yes; + self + } + + /// Set the separator used between fields for the `Count` and + /// `CountMatches` modes. + /// + /// By default, this is set to `:`. 
+ pub fn separator_field( + &mut self, + sep: Vec, + ) -> &mut SummaryBuilder { + self.config.separator_field = Arc::new(sep); + self + } + + /// Set the path separator used when printing file paths. + /// + /// Typically, printing is done by emitting the file path as is. However, + /// this setting provides the ability to use a different path separator + /// from what the current environment has configured. + /// + /// A typical use for this option is to permit cygwin users on Windows to + /// set the path separator to `/` instead of using the system default of + /// `\`. + pub fn separator_path( + &mut self, + sep: Option, + ) -> &mut SummaryBuilder { + self.config.separator_path = sep; + self + } + + /// Set the path terminator used. + /// + /// The path terminator is a byte that is printed after every file path + /// emitted by this printer. + /// + /// If no path terminator is set (the default), then paths are terminated + /// by either new lines or the configured field separator. + pub fn path_terminator( + &mut self, + terminator: Option, + ) -> &mut SummaryBuilder { + self.config.path_terminator = terminator; + self + } +} + +/// The summary printer, which emits aggregate results from a search. +/// +/// Aggregate results generally correspond to file paths and/or the number of +/// matches found. +/// +/// A default printer can be created with either of the `Summary::new` or +/// `Summary::new_no_color` constructors. However, there are a number of +/// options that configure this printer's output. Those options can be +/// configured using [`SummaryBuilder`](struct.SummaryBuilder.html). +/// +/// This type is generic over `W`, which represents any implementation of +/// the `termcolor::WriteColor` trait. +#[derive(Debug)] +pub struct Summary { + config: Config, + wtr: RefCell>, +} + +impl Summary { + /// Return a summary printer with a default configuration that writes + /// matches to the given writer. 
+ /// + /// The writer should be an implementation of `termcolor::WriteColor` + /// and not just a bare implementation of `io::Write`. To use a normal + /// `io::Write` implementation (simultaneously sacrificing colors), use + /// the `new_no_color` constructor. + /// + /// The default configuration uses the `Count` summary mode. + pub fn new(wtr: W) -> Summary { + SummaryBuilder::new().build(wtr) + } +} + +impl Summary> { + /// Return a summary printer with a default configuration that writes + /// matches to the given writer. + /// + /// The writer can be any implementation of `io::Write`. With this + /// constructor, the printer will never emit colors. + /// + /// The default configuration uses the `Count` summary mode. + pub fn new_no_color(wtr: W) -> Summary> { + SummaryBuilder::new().build_no_color(wtr) + } +} + +impl Summary { + /// Return an implementation of `Sink` for the summary printer. + /// + /// This does not associate the printer with a file path, which means this + /// implementation will never print a file path. If the output mode of + /// this summary printer does not make sense without a file path (such as + /// `PathWithMatch` or `PathWithoutMatch`), then any searches executed + /// using this sink will immediately quit with an error. + pub fn sink<'s, M: Matcher>( + &'s mut self, + matcher: M, + ) -> SummarySink<'static, 's, M, W> { + let stats = + if self.config.stats || self.config.kind.requires_stats() { + Some(Stats::new()) + } else { + None + }; + SummarySink { + matcher: matcher, + summary: self, + path: None, + start_time: Instant::now(), + match_count: 0, + binary_byte_offset: None, + stats: stats, + } + } + + /// Return an implementation of `Sink` associated with a file path. + /// + /// When the printer is associated with a path, then it may, depending on + /// its configuration, print the path. 
+ pub fn sink_with_path<'p, 's, M, P>( + &'s mut self, + matcher: M, + path: &'p P, + ) -> SummarySink<'p, 's, M, W> + where M: Matcher, + P: ?Sized + AsRef, + { + if !self.config.path { + return self.sink(matcher); + } + let stats = + if self.config.stats || self.config.kind.requires_stats() { + Some(Stats::new()) + } else { + None + }; + let ppath = PrinterPath::with_separator( + path.as_ref(), self.config.separator_path); + SummarySink { + matcher: matcher, + summary: self, + path: Some(ppath), + start_time: Instant::now(), + match_count: 0, + binary_byte_offset: None, + stats: stats, + } + } +} + +impl Summary { + /// Returns true if and only if this printer has written at least one byte + /// to the underlying writer during any of the previous searches. + pub fn has_written(&self) -> bool { + self.wtr.borrow().total_count() > 0 + } + + /// Return a mutable reference to the underlying writer. + pub fn get_mut(&mut self) -> &mut W { + self.wtr.get_mut().get_mut() + } + + /// Consume this printer and return back ownership of the underlying + /// writer. + pub fn into_inner(self) -> W { + self.wtr.into_inner().into_inner() + } +} + +/// An implementation of `Sink` associated with a matcher and an optional file +/// path for the summary printer. +/// +/// This type is generic over a few type parameters: +/// +/// * `'p` refers to the lifetime of the file path, if one is provided. When +/// no file path is given, then this is `'static`. +/// * `'s` refers to the lifetime of the +/// [`Summary`](struct.Summary.html) +/// printer that this type borrows. +/// * `M` refers to the type of matcher used by +/// `grep_searcher::Searcher` that is reporting results to this sink. +/// * `W` refers to the underlying writer that this printer is writing its +/// output to. 
+#[derive(Debug)] +pub struct SummarySink<'p, 's, M: Matcher, W: 's> { + matcher: M, + summary: &'s mut Summary, + path: Option>, + start_time: Instant, + match_count: u64, + binary_byte_offset: Option, + stats: Option, +} + +impl<'p, 's, M: Matcher, W: WriteColor> SummarySink<'p, 's, M, W> { + /// Returns true if and only if this printer received a match in the + /// previous search. + /// + /// This is unaffected by the result of searches before the previous + /// search. + pub fn has_match(&self) -> bool { + self.match_count > 0 + } + + /// If binary data was found in the previous search, this returns the + /// offset at which the binary data was first detected. + /// + /// The offset returned is an absolute offset relative to the entire + /// set of bytes searched. + /// + /// This is unaffected by the result of searches before the previous + /// search. e.g., If the search prior to the previous search found binary + /// data but the previous search found no binary data, then this will + /// return `None`. + pub fn binary_byte_offset(&self) -> Option { + self.binary_byte_offset + } + + /// Return a reference to the stats produced by the printer for all + /// searches executed on this sink. + /// + /// This only returns stats if they were requested via the + /// [`SummaryBuilder`](struct.SummaryBuilder.html) + /// configuration. + pub fn stats(&self) -> Option<&Stats> { + self.stats.as_ref() + } + + /// Returns true if this printer should quit. + /// + /// This implements the logic for handling quitting after seeing a certain + /// amount of matches. In most cases, the logic is simple, but we must + /// permit all "after" contextual lines to print after reaching the limit. 
+ fn should_quit(&self) -> bool { + let limit = match self.summary.config.max_matches { + None => return false, + Some(limit) => limit, + }; + self.match_count >= limit + } + + /// If this printer has a file path associated with it, then this will + /// write that path to the underlying writer followed by a line terminator. + /// (If a path terminator is set, then that is used instead of the line + /// terminator.) + fn write_path_line(&self, searcher: &Searcher) -> io::Result<()> { + if let Some(ref path) = self.path { + self.write_spec( + self.summary.config.colors.path(), + path.as_bytes(), + )?; + if let Some(term) = self.summary.config.path_terminator { + self.write(&[term])?; + } else { + self.write_line_term(searcher)?; + } + } + Ok(()) + } + + /// If this printer has a file path associated with it, then this will + /// write that path to the underlying writer followed by the field + /// separator. (If a path terminator is set, then that is used instead of + /// the field separator.) + fn write_path_field(&self) -> io::Result<()> { + if let Some(ref path) = self.path { + self.write_spec( + self.summary.config.colors.path(), + path.as_bytes(), + )?; + if let Some(term) = self.summary.config.path_terminator { + self.write(&[term])?; + } else { + self.write(&self.summary.config.separator_field)?; + } + } + Ok(()) + } + + /// Write the line terminator configured on the given searcher. + fn write_line_term(&self, searcher: &Searcher) -> io::Result<()> { + self.write(searcher.line_terminator().as_bytes()) + } + + /// Write the given bytes using the give style. + fn write_spec(&self, spec: &ColorSpec, buf: &[u8]) -> io::Result<()> { + self.summary.wtr.borrow_mut().set_color(spec)?; + self.write(buf)?; + self.summary.wtr.borrow_mut().reset()?; + Ok(()) + } + + /// Write all of the given bytes. 
+ fn write(&self, buf: &[u8]) -> io::Result<()> { + self.summary.wtr.borrow_mut().write_all(buf) + } +} + +impl<'p, 's, M: Matcher, W: WriteColor> Sink for SummarySink<'p, 's, M, W> { + type Error = io::Error; + + fn matched( + &mut self, + _searcher: &Searcher, + mat: &SinkMatch, + ) -> Result { + self.match_count += 1; + if let Some(ref mut stats) = self.stats { + let mut match_count = 0; + self.matcher.find_iter(mat.bytes(), |_| { + match_count += 1; + true + }).map_err(io::Error::error_message)?; + stats.add_matches(match_count); + stats.add_matched_lines(mat.lines().count() as u64); + } else if self.summary.config.kind.quit_early() { + return Ok(false); + } + Ok(!self.should_quit()) + } + + fn begin( + &mut self, + _searcher: &Searcher, + ) -> Result { + if self.path.is_none() && self.summary.config.kind.requires_path() { + return Err(io::Error::error_message(format!( + "output kind {:?} requires a file path", + self.summary.config.kind, + ))); + } + self.summary.wtr.borrow_mut().reset_count(); + self.start_time = Instant::now(); + self.match_count = 0; + self.binary_byte_offset = None; + if self.summary.config.max_matches == Some(0) { + return Ok(false); + } + + Ok(true) + } + + fn finish( + &mut self, + searcher: &Searcher, + finish: &SinkFinish, + ) -> Result<(), io::Error> { + self.binary_byte_offset = finish.binary_byte_offset(); + if let Some(ref mut stats) = self.stats { + stats.add_elapsed(self.start_time.elapsed()); + stats.add_searches(1); + if self.match_count > 0 { + stats.add_searches_with_match(1); + } + stats.add_bytes_searched(finish.byte_count()); + stats.add_bytes_printed(self.summary.wtr.borrow().count()); + } + + let show_count = + !self.summary.config.exclude_zero + || self.match_count > 0; + match self.summary.config.kind { + SummaryKind::Count => { + if show_count { + self.write_path_field()?; + self.write(self.match_count.to_string().as_bytes())?; + self.write_line_term(searcher)?; + } + } + SummaryKind::CountMatches => { + if 
show_count { + let stats = self.stats + .as_ref() + .expect("CountMatches should enable stats tracking"); + self.write_path_field()?; + self.write(stats.matches().to_string().as_bytes())?; + self.write_line_term(searcher)?; + } + } + SummaryKind::PathWithMatch => { + if self.match_count > 0 { + self.write_path_line(searcher)?; + } + } + SummaryKind::PathWithoutMatch => { + if self.match_count == 0 { + self.write_path_line(searcher)?; + } + } + SummaryKind::Quiet => {} + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use grep_regex::RegexMatcher; + use grep_searcher::SearcherBuilder; + use termcolor::NoColor; + + use super::{Summary, SummaryKind, SummaryBuilder}; + + const SHERLOCK: &'static [u8] = b"\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached. 
+"; + + fn printer_contents( + printer: &mut Summary>>, + ) -> String { + String::from_utf8(printer.get_mut().get_ref().to_owned()).unwrap() + } + + #[test] + fn path_with_match_error() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::PathWithMatch) + .build_no_color(vec![]); + let res = SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)); + assert!(res.is_err()); + } + + #[test] + fn path_without_match_error() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::PathWithoutMatch) + .build_no_color(vec![]); + let res = SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)); + assert!(res.is_err()); + } + + #[test] + fn count_no_path() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("2\n", got); + } + + #[test] + fn count_no_path_even_with_path() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .path(false) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("2\n", got); + } + + #[test] + fn count_path() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) 
+ .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("sherlock:2\n", got); + } + + #[test] + fn count_path_with_zero() { + let matcher = RegexMatcher::new( + r"NO MATCH" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .exclude_zero(false) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("sherlock:0\n", got); + } + + #[test] + fn count_path_without_zero() { + let matcher = RegexMatcher::new( + r"NO MATCH" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .exclude_zero(true) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("", got); + } + + #[test] + fn count_path_field_separator() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .separator_field(b"ZZ".to_vec()) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("sherlockZZ2\n", got); + } + + #[test] + fn count_path_terminator() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .path_terminator(Some(b'\x00')) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("sherlock\x002\n", got); + } + + #[test] + fn 
count_path_separator() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .separator_path(Some(b'\\')) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "/home/andrew/sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("\\home\\andrew\\sherlock:2\n", got); + } + + #[test] + fn count_max_matches() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Count) + .max_matches(Some(1)) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, printer.sink(&matcher)) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("1\n", got); + } + + #[test] + fn count_matches() { + let matcher = RegexMatcher::new( + r"Watson|Sherlock" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::CountMatches) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("sherlock:4\n", got); + } + + #[test] + fn path_with_match_found() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::PathWithMatch) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("sherlock\n", got); + } + + #[test] + fn path_with_match_not_found() { + let matcher = RegexMatcher::new( + r"ZZZZZZZZ" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::PathWithMatch) + .build_no_color(vec![]); + 
SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("", got); + } + + + #[test] + fn path_without_match_found() { + let matcher = RegexMatcher::new( + r"ZZZZZZZZZ" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::PathWithoutMatch) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("sherlock\n", got); + } + + #[test] + fn path_without_match_not_found() { + let matcher = RegexMatcher::new( + r"Watson" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::PathWithoutMatch) + .build_no_color(vec![]); + SearcherBuilder::new() + .build() + .search_reader( + &matcher, + SHERLOCK, + printer.sink_with_path(&matcher, "sherlock"), + ) + .unwrap(); + + let got = printer_contents(&mut printer); + assert_eq_printed!("", got); + } + + #[test] + fn quiet() { + let matcher = RegexMatcher::new( + r"Watson|Sherlock" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Quiet) + .build_no_color(vec![]); + let match_count = { + let mut sink = printer.sink_with_path(&matcher, "sherlock"); + SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, &mut sink) + .unwrap(); + sink.match_count + }; + + let got = printer_contents(&mut printer); + assert_eq_printed!("", got); + // There is actually more than one match, but Quiet should quit after + // finding the first one. 
+ assert_eq!(1, match_count); + } + + #[test] + fn quiet_with_stats() { + let matcher = RegexMatcher::new( + r"Watson|Sherlock" + ).unwrap(); + let mut printer = SummaryBuilder::new() + .kind(SummaryKind::Quiet) + .stats(true) + .build_no_color(vec![]); + let match_count = { + let mut sink = printer.sink_with_path(&matcher, "sherlock"); + SearcherBuilder::new() + .build() + .search_reader(&matcher, SHERLOCK, &mut sink) + .unwrap(); + sink.match_count + }; + + let got = printer_contents(&mut printer); + assert_eq_printed!("", got); + // There is actually more than one match, and Quiet will usually quit + // after finding the first one, but since we request stats, it will + // mush on to find all matches. + assert_eq!(3, match_count); + } +} diff --git a/grep-printer/src/util.rs b/grep-printer/src/util.rs new file mode 100644 index 000000000..e2d80c7ff --- /dev/null +++ b/grep-printer/src/util.rs @@ -0,0 +1,392 @@ +use std::borrow::Cow; +use std::fmt; +use std::io; +use std::path::Path; +use std::time; + +use grep_matcher::{Captures, LineTerminator, Match, Matcher}; +use grep_searcher::{ + LineIter, + SinkError, SinkContext, SinkContextKind, SinkMatch, +}; +#[cfg(feature = "serde1")] +use serde::{Serialize, Serializer}; + +/// A type for handling replacements while amortizing allocation. +pub struct Replacer { + space: Option>, +} + +struct Space { + /// The place to store capture locations. + caps: M::Captures, + /// The place to write a replacement to. + dst: Vec, + /// The place to store match offsets in terms of `dst`. + matches: Vec, +} + +impl fmt::Debug for Replacer { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let (dst, matches) = self.replacement().unwrap_or((&[], &[])); + f.debug_struct("Replacer") + .field("dst", &dst) + .field("matches", &matches) + .finish() + } +} + +impl Replacer { + /// Create a new replacer for use with a particular matcher. + /// + /// This constructor does not allocate. 
Instead, space for dealing with + /// replacements is allocated lazily only when needed. + pub fn new() -> Replacer { + Replacer { space: None } + } + + /// Executes a replacement on the given subject string by replacing all + /// matches with the given replacement. To access the result of the + /// replacement, use the `replacement` method. + /// + /// This can fail if the underlying matcher reports an error. + pub fn replace_all<'a>( + &'a mut self, + matcher: &M, + subject: &[u8], + replacement: &[u8], + ) -> io::Result<()> { + { + let &mut Space { + ref mut dst, + ref mut caps, + ref mut matches, + } = self.allocate(matcher)?; + dst.clear(); + matches.clear(); + + matcher.replace_with_captures( + subject, + caps, + dst, + |caps, dst| { + let start = dst.len(); + caps.interpolate( + |name| matcher.capture_index(name), + subject, + replacement, + dst, + ); + let end = dst.len(); + matches.push(Match::new(start, end)); + true + }, + ).map_err(io::Error::error_message)?; + } + Ok(()) + } + + /// Return the result of the prior replacement and the match offsets for + /// all replacement occurrences within the returned replacement buffer. + /// + /// If no replacement has occurred then `None` is returned. + pub fn replacement<'a>(&'a self) -> Option<(&'a [u8], &'a [Match])> { + match self.space { + None => None, + Some(ref space) => { + if space.matches.is_empty() { + None + } else { + Some((&space.dst, &space.matches)) + } + } + } + } + + /// Clear space used for performing a replacement. + /// + /// Subsequent calls to `replacement` after calling `clear` (but before + /// executing another replacement) will always return `None`. + pub fn clear(&mut self) { + if let Some(ref mut space) = self.space { + space.dst.clear(); + space.matches.clear(); + } + } + + /// Allocate space for replacements when used with the given matcher and + /// return a mutable reference to that space. 
+ /// + /// This can fail if allocating space for capture locations from the given + /// matcher fails. + fn allocate(&mut self, matcher: &M) -> io::Result<&mut Space> { + if self.space.is_none() { + let caps = matcher + .new_captures() + .map_err(io::Error::error_message)?; + self.space = Some(Space { + caps: caps, + dst: vec![], + matches: vec![], + }); + } + Ok(self.space.as_mut().unwrap()) + } +} + +/// A simple layer of abstraction over either a match or a contextual line +/// reported by the searcher. +/// +/// In particular, this provides an API that unions the `SinkMatch` and +/// `SinkContext` types while also exposing a list of all individual match +/// locations. +/// +/// While this serves as a convenient mechanism to abstract over `SinkMatch` +/// and `SinkContext`, this also provides a way to abstract over replacements. +/// Namely, after a replacement, a `Sunk` value can be constructed using the +/// results of the replacement instead of the bytes reported directly by the +/// searcher. 
+#[derive(Debug)] +pub struct Sunk<'a> { + bytes: &'a [u8], + absolute_byte_offset: u64, + line_number: Option, + context_kind: Option<&'a SinkContextKind>, + matches: &'a [Match], + original_matches: &'a [Match], +} + +impl<'a> Sunk<'a> { + #[inline] + pub fn empty() -> Sunk<'static> { + Sunk { + bytes: &[], + absolute_byte_offset: 0, + line_number: None, + context_kind: None, + matches: &[], + original_matches: &[], + } + } + + #[inline] + pub fn from_sink_match( + sunk: &'a SinkMatch<'a>, + original_matches: &'a [Match], + replacement: Option<(&'a [u8], &'a [Match])>, + ) -> Sunk<'a> { + let (bytes, matches) = replacement.unwrap_or_else(|| { + (sunk.bytes(), original_matches) + }); + Sunk { + bytes: bytes, + absolute_byte_offset: sunk.absolute_byte_offset(), + line_number: sunk.line_number(), + context_kind: None, + matches: matches, + original_matches: original_matches, + } + } + + #[inline] + pub fn from_sink_context( + sunk: &'a SinkContext<'a>, + original_matches: &'a [Match], + replacement: Option<(&'a [u8], &'a [Match])>, + ) -> Sunk<'a> { + let (bytes, matches) = replacement.unwrap_or_else(|| { + (sunk.bytes(), original_matches) + }); + Sunk { + bytes: bytes, + absolute_byte_offset: sunk.absolute_byte_offset(), + line_number: sunk.line_number(), + context_kind: Some(sunk.kind()), + matches: matches, + original_matches: original_matches, + } + } + + #[inline] + pub fn context_kind(&self) -> Option<&'a SinkContextKind> { + self.context_kind + } + + #[inline] + pub fn bytes(&self) -> &'a [u8] { + self.bytes + } + + #[inline] + pub fn matches(&self) -> &'a [Match] { + self.matches + } + + #[inline] + pub fn original_matches(&self) -> &'a [Match] { + self.original_matches + } + + #[inline] + pub fn lines(&self, line_term: u8) -> LineIter<'a> { + LineIter::new(line_term, self.bytes()) + } + + #[inline] + pub fn absolute_byte_offset(&self) -> u64 { + self.absolute_byte_offset + } + + #[inline] + pub fn line_number(&self) -> Option { + self.line_number + } +} + 
+/// A simple encapsulation of a file path used by a printer. +/// +/// This represents any transforms that we might want to perform on the path, +/// such as converting it to valid UTF-8 and/or replacing its separator with +/// something else. This allows us to amortize work if we are printing the +/// file path for every match. +/// +/// In the common case, no transformation is needed, which lets us avoid the +/// allocation. Typically, only Windows requires a transform, since we can't +/// access the raw bytes of a path directly and first need to lossily convert +/// to UTF-8. Windows is also typically where the path separator replacement +/// is used, e.g., in cygwin environments to use `/` instead of `\`. +/// +/// Users of this type are expected to construct it from a normal `Path` +/// found in the standard library. It can then be written to any `io::Write` +/// implementation using the `as_bytes` method. This achieves platform +/// portability with a small cost: on Windows, paths that are not valid UTF-16 +/// will not roundtrip correctly. +#[derive(Clone, Debug)] +pub struct PrinterPath<'a>(Cow<'a, [u8]>); + +impl<'a> PrinterPath<'a> { + /// Create a new path suitable for printing. + pub fn new(path: &'a Path) -> PrinterPath<'a> { + PrinterPath::new_impl(path) + } + + #[cfg(unix)] + fn new_impl(path: &'a Path) -> PrinterPath<'a> { + use std::os::unix::ffi::OsStrExt; + PrinterPath(Cow::Borrowed(path.as_os_str().as_bytes())) + } + + #[cfg(not(unix))] + fn new_impl(path: &'a Path) -> PrinterPath<'a> { + PrinterPath(match path.to_string_lossy() { + Cow::Owned(path) => Cow::Owned(path.into_bytes()), + Cow::Borrowed(path) => Cow::Borrowed(path.as_bytes()), + }) + } + + /// Create a new printer path from the given path which can be efficiently + /// written to a writer without allocation. + /// + /// If the given separator is present, then any separators in `path` are + /// replaced with it. 
+ pub fn with_separator(path: &'a Path, sep: Option) -> PrinterPath<'a> { + let mut ppath = PrinterPath::new(path); + if let Some(sep) = sep { + ppath.replace_separator(sep); + } + ppath + } + + /// Replace the path separator in this path with the given separator + /// and do it in place. On Windows, both `/` and `\` are treated as + /// path separators that are both replaced by `new_sep`. In all other + /// environments, only `/` is treated as a path separator. + fn replace_separator(&mut self, new_sep: u8) { + let transformed_path: Vec<_> = self.as_bytes().iter().map(|&b| { + if b == b'/' || (cfg!(windows) && b == b'\\') { + new_sep + } else { + b + } + }).collect(); + self.0 = Cow::Owned(transformed_path); + } + + /// Return the raw bytes for this path. + pub fn as_bytes(&self) -> &[u8] { + &*self.0 + } +} + +/// A type that provides "nicer" Display and Serialize impls for +/// std::time::Duration. The serialization format should actually be compatible +/// with the Deserialize impl for std::time::Duration, since this type only +/// adds new fields. +#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)] +pub struct NiceDuration(pub time::Duration); + +impl fmt::Display for NiceDuration { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{:0.6}s", self.fractional_seconds()) + } +} + +impl NiceDuration { + /// Returns the number of seconds in this duration in fraction form. + /// The number to the left of the decimal point is the number of seconds, + /// and the number to the right is the number of milliseconds. 
+ fn fractional_seconds(&self) -> f64 { + let fractional = (self.0.subsec_nanos() as f64) / 1_000_000_000.0; + self.0.as_secs() as f64 + fractional + } +} + +#[cfg(feature = "serde1")] +impl Serialize for NiceDuration { + fn serialize(&self, ser: S) -> Result { + use serde::ser::SerializeStruct; + + let mut state = ser.serialize_struct("Duration", 2)?; + state.serialize_field("secs", &self.0.as_secs())?; + state.serialize_field("nanos", &self.0.subsec_nanos())?; + state.serialize_field("human", &format!("{}", self))?; + state.end() + } +} + +/// Trim prefix ASCII spaces from the given slice and return the corresponding +/// range. +/// +/// This stops trimming a prefix as soon as it sees non-whitespace or a line +/// terminator. +pub fn trim_ascii_prefix_range( + line_term: LineTerminator, + slice: &[u8], + range: Match, +) -> Match { + fn is_space(b: u8) -> bool { + match b { + b'\t' | b'\n' | b'\x0B' | b'\x0C' | b'\r' | b' ' => true, + _ => false, + } + } + + let count = slice[range] + .iter() + .take_while(|&&b| -> bool { + is_space(b) && !line_term.as_bytes().contains(&b) + }) + .count(); + range.with_start(range.start() + count) +} + +/// Trim prefix ASCII spaces from the given slice and return the corresponding +/// sub-slice. +pub fn trim_ascii_prefix(line_term: LineTerminator, slice: &[u8]) -> &[u8] { + let range = trim_ascii_prefix_range( + line_term, + slice, + Match::new(0, slice.len()), + ); + &slice[range] +} diff --git a/grep-regex/Cargo.toml b/grep-regex/Cargo.toml new file mode 100644 index 000000000..e39c68ddf --- /dev/null +++ b/grep-regex/Cargo.toml @@ -0,0 +1,21 @@ +[package] +name = "grep-regex" +version = "0.0.1" #:version +authors = ["Andrew Gallant "] +description = """ +Use Rust's regex library with the 'grep' crate. 
+""" +documentation = "https://docs.rs/grep-regex" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "search", "pattern", "line"] +license = "Unlicense/MIT" + +[dependencies] +log = "0.4" +grep-matcher = { version = "0.0.1", path = "../grep-matcher" } +regex = "1" +regex-syntax = "0.6" +thread_local = "0.3.5" +utf8-ranges = "1" diff --git a/grep-regex/LICENSE-MIT b/grep-regex/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/grep-regex/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/grep-regex/README.md b/grep-regex/README.md new file mode 100644 index 000000000..7940c8677 --- /dev/null +++ b/grep-regex/README.md @@ -0,0 +1,35 @@ +grep-regex +---------- +The `grep-regex` crate provides an implementation of the `Matcher` trait from +the `grep-matcher` crate. This implementation permits Rust's regex engine to +be used in the `grep` crate for fast line oriented searching. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-regex.svg)](https://crates.io/crates/grep-regex) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-regex](https://docs.rs/grep-regex) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-regex = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_regex; +``` diff --git a/grep-regex/UNLICENSE b/grep-regex/UNLICENSE new file mode 100644 index 000000000..68a49daad --- /dev/null +++ b/grep-regex/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. 
We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to diff --git a/grep-regex/src/ast.rs b/grep-regex/src/ast.rs new file mode 100644 index 000000000..4e6067ee9 --- /dev/null +++ b/grep-regex/src/ast.rs @@ -0,0 +1,263 @@ +use regex_syntax::ast::{self, Ast}; +use regex_syntax::ast::parse::Parser; + +/// The results of analyzing AST of a regular expression (e.g., for supporting +/// smart case). +#[derive(Clone, Debug)] +pub struct AstAnalysis { + /// True if and only if a literal uppercase character occurs in the regex. + any_uppercase: bool, + /// True if and only if the regex contains any literal at all. + any_literal: bool, + /// True if and only if the regex consists entirely of a literal and no + /// other special regex characters. + all_verbatim_literal: bool, +} + +impl AstAnalysis { + /// Returns a `AstAnalysis` value by doing analysis on the AST of `pattern`. + /// + /// If `pattern` is not a valid regular expression, then `None` is + /// returned. + #[allow(dead_code)] + pub fn from_pattern(pattern: &str) -> Option { + Parser::new() + .parse(pattern) + .map(|ast| AstAnalysis::from_ast(&ast)) + .ok() + } + + /// Perform an AST analysis given the AST. + pub fn from_ast(ast: &Ast) -> AstAnalysis { + let mut analysis = AstAnalysis::new(); + analysis.from_ast_impl(ast); + analysis + } + + /// Returns true if and only if a literal uppercase character occurs in + /// the pattern. 
+ /// + /// For example, a pattern like `\pL` contains no uppercase literals, + /// even though `L` is uppercase and the `\pL` class contains uppercase + /// characters. + pub fn any_uppercase(&self) -> bool { + self.any_uppercase + } + + /// Returns true if and only if the regex contains any literal at all. + /// + /// For example, a pattern like `\pL` reports `false`, but a pattern like + /// `\pLfoo` reports `true`. + pub fn any_literal(&self) -> bool { + self.any_literal + } + + /// Returns true if and only if the entire pattern is a verbatim literal + /// with no special meta characters. + /// + /// When this is true, then the pattern satisfies the following law: + /// `escape(pattern) == pattern`. Notable examples where this returns + /// `false` include patterns like `a\u0061` even though `\u0061` is just + /// a literal `a`. + /// + /// The purpose of this flag is to determine whether the patterns can be + /// given to non-regex substring search algorithms as-is. + #[allow(dead_code)] + pub fn all_verbatim_literal(&self) -> bool { + self.all_verbatim_literal + } + + /// Creates a new `AstAnalysis` value with an initial configuration. 
+ fn new() -> AstAnalysis { + AstAnalysis { + any_uppercase: false, + any_literal: false, + all_verbatim_literal: true, + } + } + + fn from_ast_impl(&mut self, ast: &Ast) { + if self.done() { + return; + } + match *ast { + Ast::Empty(_) => {} + Ast::Flags(_) + | Ast::Dot(_) + | Ast::Assertion(_) + | Ast::Class(ast::Class::Unicode(_)) + | Ast::Class(ast::Class::Perl(_)) => { + self.all_verbatim_literal = false; + } + Ast::Literal(ref x) => { + self.from_ast_literal(x); + } + Ast::Class(ast::Class::Bracketed(ref x)) => { + self.all_verbatim_literal = false; + self.from_ast_class_set(&x.kind); + } + Ast::Repetition(ref x) => { + self.all_verbatim_literal = false; + self.from_ast_impl(&x.ast); + } + Ast::Group(ref x) => { + self.all_verbatim_literal = false; + self.from_ast_impl(&x.ast); + } + Ast::Alternation(ref alt) => { + self.all_verbatim_literal = false; + for x in &alt.asts { + self.from_ast_impl(x); + } + } + Ast::Concat(ref alt) => { + for x in &alt.asts { + self.from_ast_impl(x); + } + } + } + } + + fn from_ast_class_set(&mut self, ast: &ast::ClassSet) { + if self.done() { + return; + } + match *ast { + ast::ClassSet::Item(ref item) => { + self.from_ast_class_set_item(item); + } + ast::ClassSet::BinaryOp(ref x) => { + self.from_ast_class_set(&x.lhs); + self.from_ast_class_set(&x.rhs); + } + } + } + + fn from_ast_class_set_item(&mut self, ast: &ast::ClassSetItem) { + if self.done() { + return; + } + match *ast { + ast::ClassSetItem::Empty(_) + | ast::ClassSetItem::Ascii(_) + | ast::ClassSetItem::Unicode(_) + | ast::ClassSetItem::Perl(_) => {} + ast::ClassSetItem::Literal(ref x) => { + self.from_ast_literal(x); + } + ast::ClassSetItem::Range(ref x) => { + self.from_ast_literal(&x.start); + self.from_ast_literal(&x.end); + } + ast::ClassSetItem::Bracketed(ref x) => { + self.from_ast_class_set(&x.kind); + } + ast::ClassSetItem::Union(ref union) => { + for x in &union.items { + self.from_ast_class_set_item(x); + } + } + } + } + + fn from_ast_literal(&mut self, 
ast: &ast::Literal) { + if ast.kind != ast::LiteralKind::Verbatim { + self.all_verbatim_literal = false; + } + self.any_literal = true; + self.any_uppercase = self.any_uppercase || ast.c.is_uppercase(); + } + + /// Returns true if and only if the attributes can never change no matter + /// what other AST it might see. + fn done(&self) -> bool { + self.any_uppercase && self.any_literal && !self.all_verbatim_literal + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn analysis(pattern: &str) -> AstAnalysis { + AstAnalysis::from_pattern(pattern).unwrap() + } + + #[test] + fn various() { + let x = analysis(""); + assert!(!x.any_uppercase); + assert!(!x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis("foo"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis("Foo"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis("foO"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(x.all_verbatim_literal); + + let x = analysis(r"foo\\"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\w"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\S"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\p{Ll}"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo[a-z]"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo[A-Z]"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo[\S\t]"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"foo\\S"); + 
assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"\p{Ll}"); + assert!(!x.any_uppercase); + assert!(!x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"aBc\w"); + assert!(x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + + let x = analysis(r"a\u0061"); + assert!(!x.any_uppercase); + assert!(x.any_literal); + assert!(!x.all_verbatim_literal); + } +} diff --git a/grep-regex/src/config.rs b/grep-regex/src/config.rs new file mode 100644 index 000000000..f3d1f1c10 --- /dev/null +++ b/grep-regex/src/config.rs @@ -0,0 +1,265 @@ +use grep_matcher::{ByteSet, LineTerminator}; +use regex::bytes::{Regex, RegexBuilder}; +use regex_syntax::ast::{self, Ast}; +use regex_syntax::hir::Hir; + +use ast::AstAnalysis; +use crlf::crlfify; +use error::Error; +use literal::LiteralSets; +use non_matching::non_matching_bytes; +use strip::strip_from_match; + +/// Config represents the configuration of a regex matcher in this crate. +/// The configuration is itself a rough combination of the knobs found in +/// the `regex` crate itself, along with additional `grep-matcher` specific +/// options. +/// +/// The configuration can be used to build a "configured" HIR expression. A +/// configured HIR expression is an HIR expression that is aware of the +/// configuration which generated it, and provides transformation on that HIR +/// such that the configuration is preserved. 
+#[derive(Clone, Debug)] +pub struct Config { + pub case_insensitive: bool, + pub case_smart: bool, + pub multi_line: bool, + pub dot_matches_new_line: bool, + pub swap_greed: bool, + pub ignore_whitespace: bool, + pub unicode: bool, + pub octal: bool, + pub size_limit: usize, + pub dfa_size_limit: usize, + pub nest_limit: u32, + pub line_terminator: Option, + pub crlf: bool, + pub word: bool, +} + +impl Default for Config { + fn default() -> Config { + Config { + case_insensitive: false, + case_smart: false, + multi_line: false, + dot_matches_new_line: false, + swap_greed: false, + ignore_whitespace: false, + unicode: true, + octal: false, + // These size limits are much bigger than what's in the regex + // crate. + size_limit: 100 * (1<<20), + dfa_size_limit: 1000 * (1<<20), + nest_limit: 250, + line_terminator: None, + crlf: false, + word: false, + } + } +} + +impl Config { + /// Parse the given pattern and returned its HIR expression along with + /// the current configuration. + /// + /// If there was a problem parsing the given expression then an error + /// is returned. + pub fn hir(&self, pattern: &str) -> Result { + let analysis = self.analysis(pattern)?; + let expr = ::regex_syntax::ParserBuilder::new() + .nest_limit(self.nest_limit) + .octal(self.octal) + .allow_invalid_utf8(true) + .ignore_whitespace(self.ignore_whitespace) + .case_insensitive(self.is_case_insensitive(&analysis)?) + .multi_line(self.multi_line) + .dot_matches_new_line(self.dot_matches_new_line) + .swap_greed(self.swap_greed) + .unicode(self.unicode) + .build() + .parse(pattern) + .map_err(Error::regex)?; + let expr = match self.line_terminator { + None => expr, + Some(line_term) => strip_from_match(expr, line_term)?, + }; + Ok(ConfiguredHIR { + original: pattern.to_string(), + config: self.clone(), + analysis: analysis, + // If CRLF mode is enabled, replace `$` with `(?:\r?$)`. 
+ expr: if self.crlf { crlfify(expr) } else { expr }, + }) + } + + /// Accounting for the `smart_case` config knob, return true if and only if + /// this pattern should be matched case insensitively. + fn is_case_insensitive( + &self, + analysis: &AstAnalysis, + ) -> Result { + if self.case_insensitive { + return Ok(true); + } + if !self.case_smart { + return Ok(false); + } + Ok(analysis.any_literal() && !analysis.any_uppercase()) + } + + /// Perform analysis on the AST of this pattern. + /// + /// This returns an error if the given pattern failed to parse. + fn analysis(&self, pattern: &str) -> Result { + Ok(AstAnalysis::from_ast(&self.ast(pattern)?)) + } + + /// Parse the given pattern into its abstract syntax. + /// + /// This returns an error if the given pattern failed to parse. + fn ast(&self, pattern: &str) -> Result { + ast::parse::ParserBuilder::new() + .nest_limit(self.nest_limit) + .octal(self.octal) + .ignore_whitespace(self.ignore_whitespace) + .build() + .parse(pattern) + .map_err(Error::regex) + } +} + +/// A "configured" HIR expression, which is aware of the configuration which +/// produced this HIR. +/// +/// Since the configuration is tracked, values with this type can be +/// transformed into other HIR expressions (or regular expressions) in a way +/// that preserves the configuration. For example, the `fast_line_regex` +/// method will apply literal extraction to the inner HIR and use that to build +/// a new regex that matches the extracted literals in a way that is +/// consistent with the configuration that produced this HIR. For example, the +/// size limits set on the configured HIR will be propagated out to any +/// subsequently constructed HIR or regular expression. +#[derive(Clone, Debug)] +pub struct ConfiguredHIR { + original: String, + config: Config, + analysis: AstAnalysis, + expr: Hir, +} + +impl ConfiguredHIR { + /// Return the configuration for this HIR expression. 
+ pub fn config(&self) -> &Config { + &self.config + } + + /// Compute the set of non-matching bytes for this HIR expression. + pub fn non_matching_bytes(&self) -> ByteSet { + non_matching_bytes(&self.expr) + } + + /// Builds a regular expression from this HIR expression. + pub fn regex(&self) -> Result { + self.pattern_to_regex(&self.expr.to_string()) + } + + /// Applies the given function to the concrete syntax of this HIR and then + /// generates a new HIR based on the result of the function in a way that + /// preserves the configuration. + /// + /// For example, this can be used to wrap a user provided regular + /// expression with additional semantics. e.g., See the `WordMatcher`. + pub fn with_pattern String>( + &self, + mut f: F, + ) -> Result + { + self.pattern_to_hir(&f(&self.expr.to_string())) + } + + /// If the current configuration has a line terminator set and if useful + /// literals could be extracted, then a regular expression matching those + /// literals is returned. If no line terminator is set, then `None` is + /// returned. + /// + /// If compiling the resulting regular expression failed, then an error + /// is returned. + /// + /// This method only returns something when a line terminator is set + /// because matches from this regex are generally candidates that must be + /// confirmed before reporting a match. When performing a line oriented + /// search, confirmation is easy: just extend the candidate match to its + /// respective line boundaries and then re-search that line for a full + /// match. This only works when the line terminator is set because the line + /// terminator setting guarantees that the regex itself can never match + /// through the line terminator byte. 
+ pub fn fast_line_regex(&self) -> Result, Error> { + if self.config.line_terminator.is_none() { + return Ok(None); + } + match LiteralSets::new(&self.expr).one_regex() { + None => Ok(None), + Some(pattern) => self.pattern_to_regex(&pattern).map(Some), + } + } + + /// Create a regex from the given pattern using this HIR's configuration. + fn pattern_to_regex(&self, pattern: &str) -> Result { + // The settings we explicitly set here are intentionally a subset + // of the settings we have. The key point here is that our HIR + // expression is computed with the settings in mind, such that setting + // them here could actually lead to unintended behavior. For example, + // consider the pattern `(?U)a+`. This will get folded into the HIR + // as a non-greedy repetition operator which will in turn get printed + // to the concrete syntax as `a+?`, which is correct. But if we + // set the `swap_greed` option again, then we'll wind up with `(?U)a+?` + // which is equal to `a+` which is not the same as what we were given. + // + // We also don't need to apply `case_insensitive` since this gets + // folded into the HIR and would just cause us to do redundant work. + // + // Finally, we don't need to set `ignore_whitespace` since the concrete + // syntax emitted by the HIR printer never needs it. + // + // We set the rest of the options. Some of them are important, such as + // the size limit, and some of them are necessary to preserve the + // intention of the original pattern. For example, the Unicode flag + // will impact how the WordMatcher functions, namely, whether its + // word boundaries are Unicode aware or not. 
+ RegexBuilder::new(&pattern) + .nest_limit(self.config.nest_limit) + .octal(self.config.octal) + .multi_line(self.config.multi_line) + .dot_matches_new_line(self.config.dot_matches_new_line) + .unicode(self.config.unicode) + .size_limit(self.config.size_limit) + .dfa_size_limit(self.config.dfa_size_limit) + .build() + .map_err(Error::regex) + } + + /// Create an HIR expression from the given pattern using this HIR's + /// configuration. + fn pattern_to_hir(&self, pattern: &str) -> Result { + // See `pattern_to_regex` comment for explanation of why we only set + // a subset of knobs here. e.g., `swap_greed` is explicitly left out. + let expr = ::regex_syntax::ParserBuilder::new() + .nest_limit(self.config.nest_limit) + .octal(self.config.octal) + .allow_invalid_utf8(true) + .multi_line(self.config.multi_line) + .dot_matches_new_line(self.config.dot_matches_new_line) + .unicode(self.config.unicode) + .build() + .parse(pattern) + .map_err(Error::regex)?; + Ok(ConfiguredHIR { + original: self.original.clone(), + config: self.config.clone(), + analysis: self.analysis.clone(), + expr: expr, + }) + } +} diff --git a/grep-regex/src/crlf.rs b/grep-regex/src/crlf.rs new file mode 100644 index 000000000..ff6b15bfa --- /dev/null +++ b/grep-regex/src/crlf.rs @@ -0,0 +1,83 @@ +use regex_syntax::hir::{self, Hir, HirKind}; + +/// Substitutes all occurrences of multi-line enabled `$` with `(?:\r?$)`. +/// +/// This does not preserve the exact semantics of the given expression, +/// however, it does have the useful property that anything that matched the +/// given expression will also match the returned expression. The difference is +/// that the returned expression can match possibly other things as well. +/// +/// The principle reason why we do this is because the underlying regex engine +/// doesn't support CRLF aware `$` look-around. It's planned to fix it at that +/// level, but we perform this kludge in the mean time. 
+/// +/// Note that while the match preserving semantics are nice and neat, the +/// match position semantics are quite a bit messier. Namely, `$` only ever +/// matches the position between characters where as `\r??` can match a +/// character and change the offset. This is regretable, but works out pretty +/// nicely in most cases, especially when a match is limited to a single line. +pub fn crlfify(expr: Hir) -> Hir { + match expr.into_kind() { + HirKind::Anchor(hir::Anchor::EndLine) => { + let concat = Hir::concat(vec![ + Hir::repetition(hir::Repetition { + kind: hir::RepetitionKind::ZeroOrOne, + greedy: false, + hir: Box::new(Hir::literal(hir::Literal::Unicode('\r'))), + }), + Hir::anchor(hir::Anchor::EndLine), + ]); + Hir::group(hir::Group { + kind: hir::GroupKind::NonCapturing, + hir: Box::new(concat), + }) + } + HirKind::Empty => Hir::empty(), + HirKind::Literal(x) => Hir::literal(x), + HirKind::Class(x) => Hir::class(x), + HirKind::Anchor(x) => Hir::anchor(x), + HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Repetition(mut x) => { + x.hir = Box::new(crlfify(*x.hir)); + Hir::repetition(x) + } + HirKind::Group(mut x) => { + x.hir = Box::new(crlfify(*x.hir)); + Hir::group(x) + } + HirKind::Concat(xs) => { + Hir::concat(xs.into_iter().map(crlfify).collect()) + } + HirKind::Alternation(xs) => { + Hir::alternation(xs.into_iter().map(crlfify).collect()) + } + } +} + +#[cfg(test)] +mod tests { + use regex_syntax::Parser; + use super::crlfify; + + fn roundtrip(pattern: &str) -> String { + let expr1 = Parser::new().parse(pattern).unwrap(); + let expr2 = crlfify(expr1); + expr2.to_string() + } + + #[test] + fn various() { + assert_eq!(roundtrip(r"(?m)$"), "(?:\r??(?m:$))"); + assert_eq!(roundtrip(r"(?m)$$"), "(?:\r??(?m:$))(?:\r??(?m:$))"); + assert_eq!( + roundtrip(r"(?m)(?:foo$|bar$)"), + "(?:foo(?:\r??(?m:$))|bar(?:\r??(?m:$)))" + ); + assert_eq!(roundtrip(r"(?m)$a"), "(?:\r??(?m:$))a"); + + // Not a multiline `$`, so no crlfifying occurs. 
+ assert_eq!(roundtrip(r"$"), "\\z"); + // It's a literal, derp. + assert_eq!(roundtrip(r"\$"), "\\$"); + } +} diff --git a/grep-regex/src/error.rs b/grep-regex/src/error.rs new file mode 100644 index 000000000..f3bf08253 --- /dev/null +++ b/grep-regex/src/error.rs @@ -0,0 +1,88 @@ +use std::error; +use std::fmt; + +use util; + +/// An error that can occur in this crate. +/// +/// Generally, this error corresponds to problems building a regular +/// expression, whether it's in parsing, compilation or a problem with +/// guaranteeing a configured optimization. +#[derive(Clone, Debug)] +pub struct Error { + kind: ErrorKind, +} + +impl Error { + pub(crate) fn new(kind: ErrorKind) -> Error { + Error { kind } + } + + pub(crate) fn regex(err: E) -> Error { + Error { kind: ErrorKind::Regex(err.to_string()) } + } + + /// Return the kind of this error. + pub fn kind(&self) -> &ErrorKind { + &self.kind + } +} + +/// The kind of an error that can occur. +#[derive(Clone, Debug)] +pub enum ErrorKind { + /// An error that occurred as a result of parsing a regular expression. + /// This can be a syntax error or an error that results from attempting to + /// compile a regular expression that is too big. + /// + /// The string here is the underlying error converted to a string. + Regex(String), + /// An error that occurs when a building a regex that isn't permitted to + /// match a line terminator. In general, building the regex will do its + /// best to make matching a line terminator impossible (e.g., by removing + /// `\n` from the `\s` character class), but if the regex contains a + /// `\n` literal, then there is no reasonable choice that can be made and + /// therefore an error is reported. + /// + /// The string is the literal sequence found in the regex that is not + /// allowed. + NotAllowed(String), + /// This error occurs when a non-ASCII line terminator was provided. + /// + /// The invalid byte is included in this error. 
+ InvalidLineTerminator(u8), + /// Hints that destructuring should not be exhaustive. + /// + /// This enum may grow additional variants, so this makes sure clients + /// don't count on exhaustive matching. (Otherwise, adding a new variant + /// could break existing code.) + #[doc(hidden)] + __Nonexhaustive, +} + +impl error::Error for Error { + fn description(&self) -> &str { + match self.kind { + ErrorKind::Regex(_) => "regex error", + ErrorKind::NotAllowed(_) => "literal not allowed", + ErrorKind::InvalidLineTerminator(_) => "invalid line terminator", + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} + +impl fmt::Display for Error { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self.kind { + ErrorKind::Regex(ref s) => write!(f, "{}", s), + ErrorKind::NotAllowed(ref lit) => { + write!(f, "the literal '{:?}' is not allowed in a regex", lit) + } + ErrorKind::InvalidLineTerminator(byte) => { + let x = util::show_bytes(&[byte]); + write!(f, "line terminators must be ASCII, but '{}' is not", x) + } + ErrorKind::__Nonexhaustive => unreachable!(), + } + } +} diff --git a/grep-regex/src/lib.rs b/grep-regex/src/lib.rs new file mode 100644 index 000000000..a578d0fcb --- /dev/null +++ b/grep-regex/src/lib.rs @@ -0,0 +1,27 @@ +/*! +An implementation of `grep-matcher`'s `Matcher` trait for Rust's regex engine. 
+*/ + +#![deny(missing_docs)] + +extern crate grep_matcher; +#[macro_use] +extern crate log; +extern crate regex; +extern crate regex_syntax; +extern crate thread_local; +extern crate utf8_ranges; + +pub use error::{Error, ErrorKind}; +pub use matcher::{RegexCaptures, RegexMatcher, RegexMatcherBuilder}; + +mod ast; +mod config; +mod crlf; +mod error; +mod literal; +mod matcher; +mod non_matching; +mod strip; +mod util; +mod word; diff --git a/grep-regex/src/literal.rs b/grep-regex/src/literal.rs new file mode 100644 index 000000000..c3960ae73 --- /dev/null +++ b/grep-regex/src/literal.rs @@ -0,0 +1,304 @@ +/* +This module is responsible for extracting *inner* literals out of the AST of a +regular expression. Normally this is the job of the regex engine itself, but +the regex engine doesn't look for inner literals. Since we're doing line based +searching, we can use them, so we need to do it ourselves. +*/ + +use std::cmp; + +use regex_syntax::hir::{self, Hir, HirKind}; +use regex_syntax::hir::literal::{Literal, Literals}; + +use util; + +/// Represents prefix, suffix and inner "required" literals for a regular +/// expression. +/// +/// Prefixes and suffixes are detected using regex-syntax. The inner required +/// literals are detected using something custom (but based on the code in +/// regex-syntax). +#[derive(Clone, Debug)] +pub struct LiteralSets { + /// A set of prefix literals. + prefixes: Literals, + /// A set of suffix literals. + suffixes: Literals, + /// A set of literals such that at least one of them must appear in every + /// match. A literal in this set may be neither a prefix nor a suffix. + required: Literals, +} + +impl LiteralSets { + /// Create a set of literals from the given HIR expression. 
+ pub fn new(expr: &Hir) -> LiteralSets { + let mut required = Literals::empty(); + union_required(expr, &mut required); + LiteralSets { + prefixes: Literals::prefixes(expr), + suffixes: Literals::suffixes(expr), + required: required, + } + } + + /// If it is deemed advantageuous to do so (via various suspicious + /// heuristics), this will return a single regular expression pattern that + /// matches a subset of the language matched by the regular expression that + /// generated these literal sets. The idea here is that the pattern + /// returned by this method is much cheaper to search for. i.e., It is + /// usually a single literal or an alternation of literals. + pub fn one_regex(&self) -> Option { + // TODO: The logic in this function is basically inscrutable. It grew + // organically in the old grep 0.1 crate. Ideally, it would be + // re-worked. In fact, the entire inner literal extraction should be + // re-worked. Actually, most of regex-syntax's literal extraction + // should also be re-worked. Alas... only so much time in the day. + + if self.prefixes.all_complete() && !self.prefixes.is_empty() { + debug!("literal prefixes detected: {:?}", self.prefixes); + // When this is true, the regex engine will do a literal scan, + // so we don't need to return anything. + return None; + } + + // Out of inner required literals, prefixes and suffixes, which one + // is the longest? We pick the longest to do fast literal scan under + // the assumption that a longer literal will have a lower false + // positive rate. 
+ let pre_lcp = self.prefixes.longest_common_prefix(); + let pre_lcs = self.prefixes.longest_common_suffix(); + let suf_lcp = self.suffixes.longest_common_prefix(); + let suf_lcs = self.suffixes.longest_common_suffix(); + + let req_lits = self.required.literals(); + let req = match req_lits.iter().max_by_key(|lit| lit.len()) { + None => &[], + Some(req) => &***req, + }; + + let mut lit = pre_lcp; + if pre_lcs.len() > lit.len() { + lit = pre_lcs; + } + if suf_lcp.len() > lit.len() { + lit = suf_lcp; + } + if suf_lcs.len() > lit.len() { + lit = suf_lcs; + } + if req_lits.len() == 1 && req.len() > lit.len() { + lit = req; + } + + // Special case: if we detected an alternation of inner required + // literals and its longest literal is bigger than the longest + // prefix/suffix, then choose the alternation. In practice, this + // helps with case insensitive matching, which can generate lots of + // inner required literals. + let any_empty = req_lits.iter().any(|lit| lit.is_empty()); + if req.len() > lit.len() && req_lits.len() > 1 && !any_empty { + debug!("required literals found: {:?}", req_lits); + let alts: Vec = req_lits + .into_iter() + .map(|x| util::bytes_to_regex(x)) + .collect(); + // We're matching raw bytes, so disable Unicode mode. 
+ Some(format!("(?-u:{})", alts.join("|"))) + } else if lit.is_empty() { + None + } else { + debug!("required literal found: {:?}", util::show_bytes(lit)); + Some(format!("(?-u:{})", util::bytes_to_regex(&lit))) + } + } +} + +fn union_required(expr: &Hir, lits: &mut Literals) { + match *expr.kind() { + HirKind::Literal(hir::Literal::Unicode(c)) => { + let mut buf = [0u8; 4]; + lits.cross_add(c.encode_utf8(&mut buf).as_bytes()); + } + HirKind::Literal(hir::Literal::Byte(b)) => { + lits.cross_add(&[b]); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + if count_unicode_class(cls) >= 5 || !lits.add_char_class(cls) { + lits.cut(); + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + if count_byte_class(cls) >= 5 || !lits.add_byte_class(cls) { + lits.cut(); + } + } + HirKind::Group(hir::Group { ref hir, .. }) => { + union_required(&**hir, lits); + } + HirKind::Repetition(ref x) => { + match x.kind { + hir::RepetitionKind::ZeroOrOne => lits.cut(), + hir::RepetitionKind::ZeroOrMore => lits.cut(), + hir::RepetitionKind::OneOrMore => { + union_required(&x.hir, lits); + lits.cut(); + } + hir::RepetitionKind::Range(ref rng) => { + let (min, max) = match *rng { + hir::RepetitionRange::Exactly(m) => (m, Some(m)), + hir::RepetitionRange::AtLeast(m) => (m, None), + hir::RepetitionRange::Bounded(m, n) => (m, Some(n)), + }; + repeat_range_literals( + &x.hir, min, max, x.greedy, lits, union_required); + } + } + } + HirKind::Concat(ref es) if es.is_empty() => {} + HirKind::Concat(ref es) if es.len() == 1 => { + union_required(&es[0], lits) + } + HirKind::Concat(ref es) => { + for e in es { + let mut lits2 = lits.to_empty(); + union_required(e, &mut lits2); + if lits2.is_empty() { + lits.cut(); + continue; + } + if lits2.contains_empty() { + lits.cut(); + } + if !lits.cross_product(&lits2) { + // If this expression couldn't yield any literal that + // could be extended, then we need to quit. Since we're + // short-circuiting, we also need to freeze every member. 
+ lits.cut(); + break; + } + } + } + HirKind::Alternation(ref es) => { + alternate_literals(es, lits, union_required); + } + _ => lits.cut(), + } +} + +fn repeat_range_literals( + e: &Hir, + min: u32, + max: Option, + _greedy: bool, + lits: &mut Literals, + mut f: F, +) { + if min == 0 { + // This is a bit conservative. If `max` is set, then we could + // treat this as a finite set of alternations. For now, we + // just treat it as `e*`. + lits.cut(); + } else { + let n = cmp::min(lits.limit_size(), min as usize); + // We only extract literals from a single repetition, even though + // we could do more. e.g., `a{3}` will have `a` extracted instead of + // `aaa`. The reason is that inner literal extraction can't be unioned + // across repetitions. e.g., extracting `foofoofoo` from `(\w+foo){3}` + // is wrong. + f(e, lits); + if n < min as usize { + lits.cut(); + } + if max.map_or(true, |max| min < max) { + lits.cut(); + } + } +} + +fn alternate_literals( + es: &[Hir], + lits: &mut Literals, + mut f: F, +) { + let mut lits2 = lits.to_empty(); + for e in es { + let mut lits3 = lits.to_empty(); + lits3.set_limit_size(lits.limit_size() / 5); + f(e, &mut lits3); + if lits3.is_empty() || !lits2.union(lits3) { + // If we couldn't find suffixes for *any* of the + // alternates, then the entire alternation has to be thrown + // away and any existing members must be frozen. Similarly, + // if the union couldn't complete, stop and freeze. + lits.cut(); + return; + } + } + // All we do at the moment is look for prefixes and suffixes. If both + // are empty, then we report nothing. We should be able to do better than + // this, but we'll need something more expressive than just a "set of + // literals." 
+ let lcp = lits2.longest_common_prefix(); + let lcs = lits2.longest_common_suffix(); + if !lcp.is_empty() { + lits.cross_add(lcp); + } + lits.cut(); + if !lcs.is_empty() { + lits.add(Literal::empty()); + lits.add(Literal::new(lcs.to_vec())); + } +} + +/// Return the number of characters in the given class. +fn count_unicode_class(cls: &hir::ClassUnicode) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + +/// Return the number of bytes in the given class. +fn count_byte_class(cls: &hir::ClassBytes) -> u32 { + cls.iter().map(|r| 1 + (r.end() as u32 - r.start() as u32)).sum() +} + +#[cfg(test)] +mod tests { + use regex_syntax::Parser; + use super::LiteralSets; + + fn sets(pattern: &str) -> LiteralSets { + let hir = Parser::new().parse(pattern).unwrap(); + LiteralSets::new(&hir) + } + + fn one_regex(pattern: &str) -> Option { + sets(pattern).one_regex() + } + + // Put a pattern into the same format as the one returned by `one_regex`. + fn pat(pattern: &str) -> Option { + Some(format!("(?-u:{})", pattern)) + } + + #[test] + fn various() { + // Obviously no literals. + assert!(one_regex(r"\w").is_none()); + assert!(one_regex(r"\pL").is_none()); + + // Tantalizingly close. + assert!(one_regex(r"\w|foo").is_none()); + + // There's a literal, but it's better if the regex engine handles it + // internally. + assert!(one_regex(r"abc").is_none()); + + // Core use cases. + assert_eq!(one_regex(r"\wabc\w"), pat("abc")); + assert_eq!(one_regex(r"abc\w"), pat("abc")); + + // TODO: Make these pass. We're missing some potentially big wins + // without these. 
+ // assert_eq!(one_regex(r"\w(foo|bar|baz)"), pat("foo|bar|baz")); + // assert_eq!(one_regex(r"\w(foo|bar|baz)\w"), pat("foo|bar|baz")); + } +} diff --git a/grep-regex/src/matcher.rs b/grep-regex/src/matcher.rs new file mode 100644 index 000000000..5589508c1 --- /dev/null +++ b/grep-regex/src/matcher.rs @@ -0,0 +1,864 @@ +use std::collections::HashMap; + +use grep_matcher::{ + Captures, LineMatchKind, LineTerminator, Match, Matcher, NoError, ByteSet, +}; +use regex::bytes::{CaptureLocations, Regex}; + +use config::{Config, ConfiguredHIR}; +use error::Error; +use word::WordMatcher; + +/// A builder for constructing a `Matcher` using regular expressions. +/// +/// This builder re-exports many of the same options found on the regex crate's +/// builder, in addition to a few other options such as smart case, word +/// matching and the ability to set a line terminator which may enable certain +/// types of optimizations. +/// +/// The syntax supported is documented as part of the regex crate: +/// https://docs.rs/regex/*/regex/#syntax +#[derive(Clone, Debug)] +pub struct RegexMatcherBuilder { + config: Config, +} + +impl Default for RegexMatcherBuilder { + fn default() -> RegexMatcherBuilder { + RegexMatcherBuilder::new() + } +} + +impl RegexMatcherBuilder { + /// Create a new builder for configuring a regex matcher. + pub fn new() -> RegexMatcherBuilder { + RegexMatcherBuilder { + config: Config::default(), + } + } + + /// Build a new matcher using the current configuration for the provided + /// pattern. 
+ /// + /// The syntax supported is documented as part of the regex crate: + /// https://docs.rs/regex/*/regex/#syntax + pub fn build(&self, pattern: &str) -> Result { + let chir = self.config.hir(pattern)?; + let fast_line_regex = chir.fast_line_regex()?; + let non_matching_bytes = chir.non_matching_bytes(); + if let Some(ref re) = fast_line_regex { + trace!("extracted fast line regex: {:?}", re); + } + Ok(RegexMatcher { + config: self.config.clone(), + matcher: RegexMatcherImpl::new(&chir)?, + fast_line_regex: fast_line_regex, + non_matching_bytes: non_matching_bytes, + }) + } + + /// Set the value for the case insensitive (`i`) flag. + /// + /// When enabled, letters in the pattern will match both upper case and + /// lower case variants. + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.case_insensitive = yes; + self + } + + /// Whether to enable "smart case" or not. + /// + /// When smart case is enabled, the builder will automatically enable + /// case insensitive matching based on how the pattern is written. Namely, + /// case insensitive mode is enabled when both of the following things + /// are true: + /// + /// 1. The pattern contains at least one literal character. For example, + /// `a\w` contains a literal (`a`) but `\w` does not. + /// 2. Of the literals in the pattern, none of them are considered to be + /// uppercase according to Unicode. For example, `foo\pL` has no + /// uppercase literals but `Foo\pL` does. + pub fn case_smart(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.case_smart = yes; + self + } + + /// Set the value for the multi-line matching (`m`) flag. + /// + /// When enabled, `^` matches the beginning of lines and `$` matches the + /// end of lines. + /// + /// By default, they match beginning/end of the input. 
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.multi_line = yes; + self + } + + /// Set the value for the any character (`s`) flag, where in `.` matches + /// anything when `s` is set and matches anything except for new line when + /// it is not set (the default). + /// + /// N.B. "matches anything" means "any byte" when Unicode is disabled and + /// means "any valid UTF-8 encoding of any Unicode scalar value" when + /// Unicode is enabled. + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexMatcherBuilder { + self.config.dot_matches_new_line = yes; + self + } + + /// Set the value for the greedy swap (`U`) flag. + /// + /// When enabled, a pattern like `a*` is lazy (tries to find shortest + /// match) and `a*?` is greedy (tries to find longest match). + /// + /// By default, `a*` is greedy and `a*?` is lazy. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.swap_greed = yes; + self + } + + /// Set the value for the ignore whitespace (`x`) flag. + /// + /// When enabled, whitespace such as new lines and spaces will be ignored + /// between expressions of the pattern, and `#` can be used to start a + /// comment until the next new line. + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexMatcherBuilder { + self.config.ignore_whitespace = yes; + self + } + + /// Set the value for the Unicode (`u`) flag. + /// + /// Enabled by default. When disabled, character classes such as `\w` only + /// match ASCII word characters instead of all Unicode word characters. + pub fn unicode(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.unicode = yes; + self + } + + /// Whether to support octal syntax or not. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints in + /// a regular expression. 
For example, `a`, `\x61`, `\u0061` and + /// `\141` are all equivalent regular expressions, where the last example + /// shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, it does + /// make good error messages harder. That is, in PCRE based regex engines, + /// syntax like `\0` invokes a backreference, which is explicitly + /// unsupported in Rust's regex engine. However, many users expect it to + /// be supported. Therefore, when octal support is disabled, the error + /// message will explicitly mention that backreferences aren't supported. + /// + /// Octal syntax is disabled by default. + pub fn octal(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.octal = yes; + self + } + + /// Set the approximate size limit of the compiled regular expression. + /// + /// This roughly corresponds to the number of bytes occupied by a single + /// compiled program. If the program exceeds this number, then a + /// compilation error is returned. + pub fn size_limit(&mut self, bytes: usize) -> &mut RegexMatcherBuilder { + self.config.size_limit = bytes; + self + } + + /// Set the approximate size of the cache used by the DFA. + /// + /// This roughly corresponds to the number of bytes that the DFA will + /// use while searching. + /// + /// Note that this is a *per thread* limit. There is no way to set a global + /// limit. In particular, if a regex is used from multiple threads + /// simultaneously, then each thread may use up to the number of bytes + /// specified here. + pub fn dfa_size_limit( + &mut self, + bytes: usize, + ) -> &mut RegexMatcherBuilder { + self.config.dfa_size_limit = bytes; + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is allowed + /// to be. If the AST exceeds the given limit (e.g., with too many nested + /// groups), then an error is returned by the parser. 
+ /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an `Ast` using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire Ast is parsed. Therefore, + /// if callers want to put a limit on the amount of heap space used, then + /// they should impose a limit on the length, in bytes, of the concrete + /// pattern string. In particular, this is viable since this parser + /// implementation will limit itself to heap space proportional to the + /// lenth of the pattern string. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires a concatenation, which results in a nest + /// depth of `1`. In general, a nest limit is not something that manifests + /// in an obvious way in the concrete syntax, therefore, it should not be + /// used in a granular way. + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexMatcherBuilder { + self.config.nest_limit = limit; + self + } + + /// Set an ASCII line terminator for the matcher. + /// + /// The purpose of setting a line terminator is to enable a certain class + /// of optimizations that can make line oriented searching faster. Namely, + /// when a line terminator is enabled, then the builder will guarantee that + /// the resulting matcher will never be capable of producing a match that + /// contains the line terminator. Because of this guarantee, users of the + /// resulting matcher do not need to slowly execute a search line by line + /// for line oriented search. 
+ /// + /// If the aforementioned guarantee about not matching a line terminator + /// cannot be made because of how the pattern was written, then the builder + /// will return an error when attempting to construct the matcher. For + /// example, the pattern `a\sb` will be transformed such that it can never + /// match `a\nb` (when `\n` is the line terminator), but the pattern `a\nb` + /// will result in an error since the `\n` cannot be easily removed without + /// changing the fundamental intent of the pattern. + /// + /// If the given line terminator isn't an ASCII byte (`<=127`), then the + /// builder will return an error when constructing the matcher. + pub fn line_terminator( + &mut self, + line_term: Option, + ) -> &mut RegexMatcherBuilder { + self.config.line_terminator = line_term.map(LineTerminator::byte); + self + } + + /// Set the line terminator to `\r\n` and enable CRLF matching for `$` in + /// regex patterns. + /// + /// This method sets two distinct settings: + /// + /// 1. It causes the line terminator for the matcher to be `\r\n`. Namely, + /// this prevents the matcher from ever producing a match that contains + /// a `\r` or `\n`. + /// 2. It translates all instances of `$` in the pattern to `(?:\r??$)`. + /// This works around the fact that the regex engine does not support + /// matching CRLF as a line terminator when using `$`. + /// + /// In particular, because of (2), the matches produced by the matcher may + /// be slightly different than what one would expect given the pattern. + /// This is the trade off made: in many cases, `$` will "just work" in the + /// presence of `\r\n` line terminators, but matches may require some + /// trimming to faithfully represent the intended match. + /// + /// Note that if you do not wish to set the line terminator but would still + /// like `$` to match `\r\n` line terminators, then it is valid to call + /// `crlf(true)` followed by `line_terminator(None)`. 
Ordering is + /// important, since `crlf` and `line_terminator` override each other. + pub fn crlf(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + if yes { + self.config.line_terminator = Some(LineTerminator::crlf()); + } else { + self.config.line_terminator = None; + } + self.config.crlf = yes; + self + } + + /// Require that all matches occur on word boundaries. + /// + /// Enabling this option is subtly different than putting `\b` assertions + /// on both sides of your pattern. In particular, a `\b` assertion requires + /// that one side of it match a word character while the other match a + /// non-word character. This option, in contrast, merely requires that + /// one side match a non-word character. + /// + /// For example, `\b-2\b` will not match `foo -2 bar` since `-` is not a + /// word character. However, `-2` with this `word` option enabled will + /// match the `-2` in `foo -2 bar`. + pub fn word(&mut self, yes: bool) -> &mut RegexMatcherBuilder { + self.config.word = yes; + self + } +} + +/// An implementation of the `Matcher` trait using Rust's standard regex +/// library. +#[derive(Clone, Debug)] +pub struct RegexMatcher { + /// The configuration specified by the caller. + config: Config, + /// The underlying matcher implementation. + matcher: RegexMatcherImpl, + /// A regex that never reports false negatives but may report false + /// positives that is believed to be capable of being matched more quickly + /// than `regex`. Typically, this is a single literal or an alternation + /// of literals. + fast_line_regex: Option, + /// A set of bytes that will never appear in a match. + non_matching_bytes: ByteSet, +} + +impl RegexMatcher { + /// Create a new matcher from the given pattern using the default + /// configuration. + pub fn new(pattern: &str) -> Result { + RegexMatcherBuilder::new().build(pattern) + } + + /// Create a new matcher from the given pattern using the default + /// configuration, but matches lines terminated by `\n`. 
+ /// + /// This returns an error if the given pattern contains a literal `\n`. + /// Other uses of `\n` (such as in `\s`) are removed transparently. + pub fn new_line_matcher(pattern: &str) -> Result { + RegexMatcherBuilder::new() + .line_terminator(Some(b'\n')) + .build(pattern) + } +} + +/// An encapsulation of the type of matcher we use in `RegexMatcher`. +#[derive(Clone, Debug)] +enum RegexMatcherImpl { + /// The standard matcher used for all regular expressions. + Standard(StandardMatcher), + /// A matcher that only matches at word boundaries. This transforms the + /// regex to `(^|\W)(...)($|\W)` instead of the more intuitive `\b(...)\b`. + /// Because of this, the WordMatcher provides its own implementation of + /// `Matcher` to encapsulate its use of capture groups to make them + /// invisible to the caller. + Word(WordMatcher), +} + +impl RegexMatcherImpl { + /// Based on the configuration, create a new implementation of the + /// `Matcher` trait. + fn new(expr: &ConfiguredHIR) -> Result { + if expr.config().word { + Ok(RegexMatcherImpl::Word(WordMatcher::new(expr)?)) + } else { + Ok(RegexMatcherImpl::Standard(StandardMatcher::new(expr)?)) + } + } +} + +// This implementation just dispatches on the internal matcher impl except +// for the line terminator optimization, which is possibly executed via +// `fast_line_regex`. 
+impl Matcher for RegexMatcher { + type Captures = RegexCaptures; + type Error = NoError; + + fn find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, NoError> { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.find_at(haystack, at), + Word(ref m) => m.find_at(haystack, at), + } + } + + fn new_captures(&self) -> Result { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.new_captures(), + Word(ref m) => m.new_captures(), + } + } + + fn capture_count(&self) -> usize { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.capture_count(), + Word(ref m) => m.capture_count(), + } + } + + fn capture_index(&self, name: &str) -> Option { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.capture_index(name), + Word(ref m) => m.capture_index(name), + } + } + + fn find(&self, haystack: &[u8]) -> Result, NoError> { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.find(haystack), + Word(ref m) => m.find(haystack), + } + } + + fn find_iter( + &self, + haystack: &[u8], + matched: F, + ) -> Result<(), NoError> + where F: FnMut(Match) -> bool + { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.find_iter(haystack, matched), + Word(ref m) => m.find_iter(haystack, matched), + } + } + + fn try_find_iter( + &self, + haystack: &[u8], + matched: F, + ) -> Result, NoError> + where F: FnMut(Match) -> Result + { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.try_find_iter(haystack, matched), + Word(ref m) => m.try_find_iter(haystack, matched), + } + } + + fn captures( + &self, + haystack: &[u8], + caps: &mut RegexCaptures, + ) -> Result { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.captures(haystack, caps), + Word(ref m) => m.captures(haystack, caps), + } + } + + fn captures_iter( + &self, + haystack: &[u8], + caps: &mut RegexCaptures, + 
matched: F, + ) -> Result<(), NoError> + where F: FnMut(&RegexCaptures) -> bool + { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.captures_iter(haystack, caps, matched), + Word(ref m) => m.captures_iter(haystack, caps, matched), + } + } + + fn try_captures_iter( + &self, + haystack: &[u8], + caps: &mut RegexCaptures, + matched: F, + ) -> Result, NoError> + where F: FnMut(&RegexCaptures) -> Result + { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.try_captures_iter(haystack, caps, matched), + Word(ref m) => m.try_captures_iter(haystack, caps, matched), + } + } + + fn captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut RegexCaptures, + ) -> Result { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.captures_at(haystack, at, caps), + Word(ref m) => m.captures_at(haystack, at, caps), + } + } + + fn replace( + &self, + haystack: &[u8], + dst: &mut Vec, + append: F, + ) -> Result<(), NoError> + where F: FnMut(Match, &mut Vec) -> bool + { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.replace(haystack, dst, append), + Word(ref m) => m.replace(haystack, dst, append), + } + } + + fn replace_with_captures( + &self, + haystack: &[u8], + caps: &mut RegexCaptures, + dst: &mut Vec, + append: F, + ) -> Result<(), NoError> + where F: FnMut(&Self::Captures, &mut Vec) -> bool + { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => { + m.replace_with_captures(haystack, caps, dst, append) + } + Word(ref m) => { + m.replace_with_captures(haystack, caps, dst, append) + } + } + } + + fn is_match(&self, haystack: &[u8]) -> Result { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.is_match(haystack), + Word(ref m) => m.is_match(haystack), + } + } + + fn is_match_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => 
m.is_match_at(haystack, at), + Word(ref m) => m.is_match_at(haystack, at), + } + } + + fn shortest_match( + &self, + haystack: &[u8], + ) -> Result, NoError> { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.shortest_match(haystack), + Word(ref m) => m.shortest_match(haystack), + } + } + + fn shortest_match_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, NoError> { + use self::RegexMatcherImpl::*; + match self.matcher { + Standard(ref m) => m.shortest_match_at(haystack, at), + Word(ref m) => m.shortest_match_at(haystack, at), + } + } + + fn non_matching_bytes(&self) -> Option<&ByteSet> { + Some(&self.non_matching_bytes) + } + + fn line_terminator(&self) -> Option { + self.config.line_terminator + } + + fn find_candidate_line( + &self, + haystack: &[u8], + ) -> Result, NoError> { + Ok(match self.fast_line_regex { + Some(ref regex) => { + regex.shortest_match(haystack).map(LineMatchKind::Candidate) + } + None => { + self.shortest_match(haystack)?.map(LineMatchKind::Confirmed) + } + }) + } +} + +/// The implementation of the standard regex matcher. +#[derive(Clone, Debug)] +struct StandardMatcher { + /// The regular expression compiled from the pattern provided by the + /// caller. + regex: Regex, + /// A map from capture group name to its corresponding index. 
+ names: HashMap, +} + +impl StandardMatcher { + fn new(expr: &ConfiguredHIR) -> Result { + let regex = expr.regex()?; + let mut names = HashMap::new(); + for (i, optional_name) in regex.capture_names().enumerate() { + if let Some(name) = optional_name { + names.insert(name.to_string(), i); + } + } + Ok(StandardMatcher { regex, names }) + } +} + +impl Matcher for StandardMatcher { + type Captures = RegexCaptures; + type Error = NoError; + + fn find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, NoError> { + Ok(self.regex + .find_at(haystack, at) + .map(|m| Match::new(m.start(), m.end()))) + } + + fn new_captures(&self) -> Result { + Ok(RegexCaptures::new(self.regex.capture_locations())) + } + + fn capture_count(&self) -> usize { + self.regex.captures_len() + } + + fn capture_index(&self, name: &str) -> Option { + self.names.get(name).map(|i| *i) + } + + fn try_find_iter( + &self, + haystack: &[u8], + mut matched: F, + ) -> Result, NoError> + where F: FnMut(Match) -> Result + { + for m in self.regex.find_iter(haystack) { + match matched(Match::new(m.start(), m.end())) { + Ok(true) => continue, + Ok(false) => return Ok(Ok(())), + Err(err) => return Ok(Err(err)), + } + } + Ok(Ok(())) + } + + fn captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut RegexCaptures, + ) -> Result { + Ok(self.regex.captures_read_at(&mut caps.locs, haystack, at).is_some()) + } + + fn shortest_match_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, NoError> { + Ok(self.regex.shortest_match_at(haystack, at)) + } +} + +/// Represents the match offsets of each capturing group in a match. +/// +/// The first, or `0`th capture group, always corresponds to the entire match +/// and is guaranteed to be present when a match occurs. The next capture +/// group, at index `1`, corresponds to the first capturing group in the regex, +/// ordered by the position at which the left opening parenthesis occurs. 
+/// + /// Note that not all capturing groups are guaranteed to be present in a match. + /// For example, in the regex, `(?P\w)|(?P\W)`, only one of `foo` + /// or `bar` will ever be set in any given match. + /// + /// In order to access a capture group by name, you'll need to first find the + /// index of the group using the corresponding matcher's `capture_index` + /// method, and then use that index with `RegexCaptures::get`. +#[derive(Clone, Debug)] +pub struct RegexCaptures { + /// Where the locations are stored. + locs: CaptureLocations, + /// These captures behave as if the capturing groups begin at the given + /// offset. When set to `0`, this has no effect and capture groups are + /// indexed like normal. + /// + /// This is useful when building matchers that wrap arbitrary regular + /// expressions. For example, `WordMatcher` takes an existing regex `re` + /// and creates `(?:^|\W)(re)(?:$|\W)`, but hides the fact that the regex + /// has been wrapped from the caller. In order to do this, the matcher + /// and the capturing groups must behave as if `(re)` is the `0`th capture + /// group. + offset: usize, +} + +impl Captures for RegexCaptures { + fn len(&self) -> usize { + self.locs.len().checked_sub(self.offset).unwrap() + } + + fn get(&self, i: usize) -> Option { + let actual = i.checked_add(self.offset).unwrap(); + self.locs.pos(actual).map(|(s, e)| Match::new(s, e)) + } +} + +impl RegexCaptures { + pub(crate) fn new(locs: CaptureLocations) -> RegexCaptures { + RegexCaptures::with_offset(locs, 0) + } + + pub(crate) fn with_offset( + locs: CaptureLocations, + offset: usize, + ) -> RegexCaptures { + RegexCaptures { locs, offset } + } + + pub(crate) fn locations(&mut self) -> &mut CaptureLocations { + &mut self.locs + } +} + +#[cfg(test)] +mod tests { + use grep_matcher::{LineMatchKind, Matcher}; + use super::*; + + // Test that enabling word matches does the right thing and demonstrate + // the difference between it and surrounding the regex in `\b`. 
+ #[test] + fn word() { + let matcher = RegexMatcherBuilder::new() + .word(true) + .build(r"-2") + .unwrap(); + assert!(matcher.is_match(b"abc -2 foo").unwrap()); + + let matcher = RegexMatcherBuilder::new() + .word(false) + .build(r"\b-2\b") + .unwrap(); + assert!(!matcher.is_match(b"abc -2 foo").unwrap()); + } + + // Test that enabling a line terminator prevents it from matching through + // said line terminator. + #[test] + fn line_terminator() { + // This works, because there's no line terminator specified. + let matcher = RegexMatcherBuilder::new() + .build(r"abc\sxyz") + .unwrap(); + assert!(matcher.is_match(b"abc\nxyz").unwrap()); + + // This doesn't. + let matcher = RegexMatcherBuilder::new() + .line_terminator(Some(b'\n')) + .build(r"abc\sxyz") + .unwrap(); + assert!(!matcher.is_match(b"abc\nxyz").unwrap()); + } + + // Ensure that the builder returns an error if a line terminator is set + // and the regex could not be modified to remove a line terminator. + #[test] + fn line_terminator_error() { + assert!(RegexMatcherBuilder::new() + .line_terminator(Some(b'\n')) + .build(r"a\nz") + .is_err()) + } + + // Test that enabling CRLF permits `$` to match at the end of a line. + #[test] + fn line_terminator_crlf() { + // Test normal use of `$` with a `\n` line terminator. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .build(r"abc$") + .unwrap(); + assert!(matcher.is_match(b"abc\n").unwrap()); + + // Test that `$` doesn't match at `\r\n` boundary normally. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .build(r"abc$") + .unwrap(); + assert!(!matcher.is_match(b"abc\r\n").unwrap()); + + // Now check the CRLF handling. + let matcher = RegexMatcherBuilder::new() + .multi_line(true) + .crlf(true) + .build(r"abc$") + .unwrap(); + assert!(matcher.is_match(b"abc\r\n").unwrap()); + } + + // Test that smart case works. 
+ #[test] + fn case_smart() { + let matcher = RegexMatcherBuilder::new() + .case_smart(true) + .build(r"abc") + .unwrap(); + assert!(matcher.is_match(b"ABC").unwrap()); + + let matcher = RegexMatcherBuilder::new() + .case_smart(true) + .build(r"aBc") + .unwrap(); + assert!(!matcher.is_match(b"ABC").unwrap()); + } + + // Test that finding candidate lines works as expected. + #[test] + fn candidate_lines() { + fn is_confirmed(m: LineMatchKind) -> bool { + match m { + LineMatchKind::Confirmed(_) => true, + _ => false, + } + } + fn is_candidate(m: LineMatchKind) -> bool { + match m { + LineMatchKind::Candidate(_) => true, + _ => false, + } + } + + // With no line terminator set, we can't employ any optimizations, + // so we get a confirmed match. + let matcher = RegexMatcherBuilder::new() + .build(r"\wfoo\s") + .unwrap(); + let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap(); + assert!(is_confirmed(m)); + + // With a line terminator and a regex specially crafted to have an + // easy-to-detect inner literal, we can apply an optimization that + // quickly finds candidate matches. + let matcher = RegexMatcherBuilder::new() + .line_terminator(Some(b'\n')) + .build(r"\wfoo\s") + .unwrap(); + let m = matcher.find_candidate_line(b"afoo ").unwrap().unwrap(); + assert!(is_candidate(m)); + } +} diff --git a/grep-regex/src/non_matching.rs b/grep-regex/src/non_matching.rs new file mode 100644 index 000000000..5c44fa9b5 --- /dev/null +++ b/grep-regex/src/non_matching.rs @@ -0,0 +1,128 @@ +use grep_matcher::ByteSet; +use regex_syntax::hir::{self, Hir, HirKind}; +use utf8_ranges::Utf8Sequences; + +/// Return a confirmed set of non-matching bytes from the given expression. +pub fn non_matching_bytes(expr: &Hir) -> ByteSet { + let mut set = ByteSet::full(); + remove_matching_bytes(expr, &mut set); + set +} + +/// Remove any bytes from the given set that can occur in a match produced by +/// the given expression. 
+fn remove_matching_bytes( + expr: &Hir, + set: &mut ByteSet, +) { + match *expr.kind() { + HirKind::Empty + | HirKind::Anchor(_) + | HirKind::WordBoundary(_) => {} + HirKind::Literal(hir::Literal::Unicode(c)) => { + for &b in c.encode_utf8(&mut [0; 4]).as_bytes() { + set.remove(b); + } + } + HirKind::Literal(hir::Literal::Byte(b)) => { + set.remove(b); + } + HirKind::Class(hir::Class::Unicode(ref cls)) => { + for range in cls.iter() { + // This is presumably faster than encoding every codepoint + // to UTF-8 and then removing those bytes from the set. + for seq in Utf8Sequences::new(range.start(), range.end()) { + for byte_range in seq.as_slice() { + set.remove_all(byte_range.start, byte_range.end); + } + } + } + } + HirKind::Class(hir::Class::Bytes(ref cls)) => { + for range in cls.iter() { + set.remove_all(range.start(), range.end()); + } + } + HirKind::Repetition(ref x) => { + remove_matching_bytes(&x.hir, set); + } + HirKind::Group(ref x) => { + remove_matching_bytes(&x.hir, set); + } + HirKind::Concat(ref xs) => { + for x in xs { + remove_matching_bytes(x, set); + } + } + HirKind::Alternation(ref xs) => { + for x in xs { + remove_matching_bytes(x, set); + } + } + } +} + +#[cfg(test)] +mod tests { + use grep_matcher::ByteSet; + use regex_syntax::ParserBuilder; + + use super::non_matching_bytes; + + fn extract(pattern: &str) -> ByteSet { + let expr = ParserBuilder::new() + .allow_invalid_utf8(true) + .build() + .parse(pattern) + .unwrap(); + non_matching_bytes(&expr) + } + + fn sparse(set: &ByteSet) -> Vec { + let mut sparse_set = vec![]; + for b in (0..256).map(|b| b as u8) { + if set.contains(b) { + sparse_set.push(b); + } + } + sparse_set + } + + fn sparse_except(except: &[u8]) -> Vec { + let mut except_set = vec![false; 256]; + for &b in except { + except_set[b as usize] = true; + } + + let mut set = vec![]; + for b in (0..256).map(|b| b as u8) { + if !except_set[b as usize] { + set.push(b); + } + } + set + } + + #[test] + fn dot() { + 
assert_eq!(sparse(&extract(".")), vec![ + b'\n', + 192, 193, 245, 246, 247, 248, 249, + 250, 251, 252, 253, 254, 255, + ]); + assert_eq!(sparse(&extract("(?s).")), vec![ + 192, 193, 245, 246, 247, 248, 249, + 250, 251, 252, 253, 254, 255, + ]); + assert_eq!(sparse(&extract("(?-u).")), vec![b'\n']); + assert_eq!(sparse(&extract("(?s-u).")), vec![]); + } + + #[test] + fn literal() { + assert_eq!(sparse(&extract("a")), sparse_except(&[b'a'])); + assert_eq!(sparse(&extract("☃")), sparse_except(&[0xE2, 0x98, 0x83])); + assert_eq!(sparse(&extract(r"\xFF")), sparse_except(&[0xC3, 0xBF])); + assert_eq!(sparse(&extract(r"(?-u)\xFF")), sparse_except(&[0xFF])); + } +} diff --git a/grep-regex/src/strip.rs b/grep-regex/src/strip.rs new file mode 100644 index 000000000..6cf3e47f9 --- /dev/null +++ b/grep-regex/src/strip.rs @@ -0,0 +1,154 @@ +use grep_matcher::LineTerminator; +use regex_syntax::hir::{self, Hir, HirKind}; + +use error::{Error, ErrorKind}; + +/// Return an HIR that is guaranteed to never match the given line terminator, +/// if possible. +/// +/// If the transformation isn't possible, then an error is returned. +/// +/// In general, if a literal line terminator occurs anywhere in the HIR, then +/// this will return an error. However, if the line terminator occurs within +/// a character class with at least one other character (that isn't also a line +/// terminator), then the line terminator is simply stripped from that class. +/// +/// If the given line terminator is not ASCII, then this function returns an +/// error. +pub fn strip_from_match( + expr: Hir, + line_term: LineTerminator, +) -> Result { + if line_term.is_crlf() { + let expr1 = strip_from_match_ascii(expr, b'\r')?; + strip_from_match_ascii(expr1, b'\n') + } else { + let b = line_term.as_byte(); + if b > 0x7F { + return Err(Error::new(ErrorKind::InvalidLineTerminator(b))); + } + strip_from_match_ascii(expr, b) + } +} + +/// The implementation of strip_from_match. The given byte must be ASCII. 
This +/// function panics otherwise. +fn strip_from_match_ascii( + expr: Hir, + byte: u8, +) -> Result { + assert!(byte <= 0x7F); + let chr = byte as char; + assert_eq!(chr.len_utf8(), 1); + + let invalid = || Err(Error::new(ErrorKind::NotAllowed(chr.to_string()))); + + Ok(match expr.into_kind() { + HirKind::Empty => Hir::empty(), + HirKind::Literal(hir::Literal::Unicode(c)) => { + if c == chr { + return invalid(); + } + Hir::literal(hir::Literal::Unicode(c)) + } + HirKind::Literal(hir::Literal::Byte(b)) => { + if b as char == chr { + return invalid(); + } + Hir::literal(hir::Literal::Byte(b)) + } + HirKind::Class(hir::Class::Unicode(mut cls)) => { + let remove = hir::ClassUnicode::new(Some( + hir::ClassUnicodeRange::new(chr, chr), + )); + cls.difference(&remove); + if cls.ranges().is_empty() { + return invalid(); + } + Hir::class(hir::Class::Unicode(cls)) + } + HirKind::Class(hir::Class::Bytes(mut cls)) => { + let remove = hir::ClassBytes::new(Some( + hir::ClassBytesRange::new(byte, byte), + )); + cls.difference(&remove); + if cls.ranges().is_empty() { + return invalid(); + } + Hir::class(hir::Class::Bytes(cls)) + } + HirKind::Anchor(x) => Hir::anchor(x), + HirKind::WordBoundary(x) => Hir::word_boundary(x), + HirKind::Repetition(mut x) => { + x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?); + Hir::repetition(x) + } + HirKind::Group(mut x) => { + x.hir = Box::new(strip_from_match_ascii(*x.hir, byte)?); + Hir::group(x) + } + HirKind::Concat(xs) => { + let xs = xs.into_iter() + .map(|e| strip_from_match_ascii(e, byte)) + .collect::, Error>>()?; + Hir::concat(xs) + } + HirKind::Alternation(xs) => { + let xs = xs.into_iter() + .map(|e| strip_from_match_ascii(e, byte)) + .collect::, Error>>()?; + Hir::alternation(xs) + } + }) +} + +#[cfg(test)] +mod tests { + use regex_syntax::Parser; + + use error::Error; + use super::{LineTerminator, strip_from_match}; + + fn roundtrip(pattern: &str, byte: u8) -> String { + roundtrip_line_term(pattern, 
LineTerminator::byte(byte)).unwrap() + } + + fn roundtrip_crlf(pattern: &str) -> String { + roundtrip_line_term(pattern, LineTerminator::crlf()).unwrap() + } + + fn roundtrip_err(pattern: &str, byte: u8) -> Result { + roundtrip_line_term(pattern, LineTerminator::byte(byte)) + } + + fn roundtrip_line_term( + pattern: &str, + line_term: LineTerminator, + ) -> Result { + let expr1 = Parser::new().parse(pattern).unwrap(); + let expr2 = strip_from_match(expr1, line_term)?; + Ok(expr2.to_string()) + } + + #[test] + fn various() { + assert_eq!(roundtrip(r"[a\n]", b'\n'), "[a]"); + assert_eq!(roundtrip(r"[a\n]", b'a'), "[\n]"); + assert_eq!(roundtrip_crlf(r"[a\n]"), "[a]"); + assert_eq!(roundtrip_crlf(r"[a\r]"), "[a]"); + assert_eq!(roundtrip_crlf(r"[a\r\n]"), "[a]"); + + assert_eq!(roundtrip(r"(?-u)\s", b'a'), r"(?-u:[\x09-\x0D\x20])"); + assert_eq!(roundtrip(r"(?-u)\s", b'\n'), r"(?-u:[\x09\x0B-\x0D\x20])"); + + assert!(roundtrip_err(r"\n", b'\n').is_err()); + assert!(roundtrip_err(r"abc\n", b'\n').is_err()); + assert!(roundtrip_err(r"\nabc", b'\n').is_err()); + assert!(roundtrip_err(r"abc\nxyz", b'\n').is_err()); + assert!(roundtrip_err(r"\x0A", b'\n').is_err()); + assert!(roundtrip_err(r"\u000A", b'\n').is_err()); + assert!(roundtrip_err(r"\U0000000A", b'\n').is_err()); + assert!(roundtrip_err(r"\u{A}", b'\n').is_err()); + assert!(roundtrip_err("\n", b'\n').is_err()); + } +} diff --git a/grep-regex/src/util.rs b/grep-regex/src/util.rs new file mode 100644 index 000000000..9b4e67c7b --- /dev/null +++ b/grep-regex/src/util.rs @@ -0,0 +1,29 @@ +/// Converts an arbitrary sequence of bytes to a literal suitable for building +/// a regular expression. 
+pub fn bytes_to_regex(bs: &[u8]) -> String { + use std::fmt::Write; + use regex_syntax::is_meta_character; + + let mut s = String::with_capacity(bs.len()); + for &b in bs { + if b <= 0x7F && !is_meta_character(b as char) { + write!(s, r"{}", b as char).unwrap(); + } else { + write!(s, r"\x{:02x}", b).unwrap(); + } + } + s +} + +/// Converts arbitrary bytes to a nice string. +pub fn show_bytes(bs: &[u8]) -> String { + use std::ascii::escape_default; + use std::str; + + let mut nice = String::new(); + for &b in bs { + let part: Vec = escape_default(b).collect(); + nice.push_str(str::from_utf8(&part).unwrap()); + } + nice +} diff --git a/grep-regex/src/word.rs b/grep-regex/src/word.rs new file mode 100644 index 000000000..be7400200 --- /dev/null +++ b/grep-regex/src/word.rs @@ -0,0 +1,196 @@ +use std::collections::HashMap; +use std::cell::RefCell; +use std::sync::Arc; + +use grep_matcher::{Match, Matcher, NoError}; +use regex::bytes::{CaptureLocations, Regex}; +use thread_local::CachedThreadLocal; + +use config::ConfiguredHIR; +use error::Error; +use matcher::RegexCaptures; + +/// A matcher for implementing "word match" semantics. +#[derive(Debug)] +pub struct WordMatcher { + /// The regex which is roughly `(?:^|\W)()(?:$|\W)`. + regex: Regex, + /// A map from capture group name to capture group index. + names: HashMap, + /// A reusable buffer for finding the match location of the inner group. + locs: Arc>>, +} + +impl Clone for WordMatcher { + fn clone(&self) -> WordMatcher { + // We implement Clone manually so that we get a fresh CachedThreadLocal + // such that it can set its own thread owner. This permits each thread + // using `locs` to hit the fast path. + WordMatcher { + regex: self.regex.clone(), + names: self.names.clone(), + locs: Arc::new(CachedThreadLocal::new()), + } + } +} + +impl WordMatcher { + /// Create a new matcher from the given pattern that only produces matches + /// that are considered "words." 
+ /// + /// The given options are used to construct the regular expression + /// internally. + pub fn new(expr: &ConfiguredHIR) -> Result { + let word_expr = expr.with_pattern(|pat| { + format!(r"(?:(?m:^)|\W)({})(?:(?m:$)|\W)", pat) + })?; + let regex = word_expr.regex()?; + let locs = Arc::new(CachedThreadLocal::new()); + + let mut names = HashMap::new(); + for (i, optional_name) in regex.capture_names().enumerate() { + if let Some(name) = optional_name { + names.insert(name.to_string(), i.checked_sub(1).unwrap()); + } + } + Ok(WordMatcher { regex, names, locs }) + } +} + +impl Matcher for WordMatcher { + type Captures = RegexCaptures; + type Error = NoError; + + fn find_at( + &self, + haystack: &[u8], + at: usize, + ) -> Result, NoError> { + // To make this easy to get right, we extract captures here instead of + // calling `find_at`. The actual match is at capture group `1` instead + // of `0`. We *could* use `find_at` here and then trim the match after + // the fact, but that's a bit harder to get right, and it's not clear + // if it's worth it. + + let cell = self.locs.get_or(|| { + Box::new(RefCell::new(self.regex.capture_locations())) + }); + let mut caps = cell.borrow_mut(); + self.regex.captures_read_at(&mut caps, haystack, at); + Ok(caps.get(1).map(|m| Match::new(m.0, m.1))) + } + + fn new_captures(&self) -> Result { + Ok(RegexCaptures::with_offset(self.regex.capture_locations(), 1)) + } + + fn capture_count(&self) -> usize { + self.regex.captures_len().checked_sub(1).unwrap() + } + + fn capture_index(&self, name: &str) -> Option { + self.names.get(name).map(|i| *i) + } + + fn captures_at( + &self, + haystack: &[u8], + at: usize, + caps: &mut RegexCaptures, + ) -> Result { + let r = self.regex.captures_read_at(caps.locations(), haystack, at); + Ok(r.is_some()) + } + + // We specifically do not implement other methods like find_iter or + // captures_iter. 
Namely, the iter methods are guaranteed to be correct + // by virtue of implementing find_at and captures_at above. +} + +#[cfg(test)] +mod tests { + use grep_matcher::{Captures, Match, Matcher}; + use config::Config; + use super::WordMatcher; + + fn matcher(pattern: &str) -> WordMatcher { + let chir = Config::default().hir(pattern).unwrap(); + WordMatcher::new(&chir).unwrap() + } + + fn find(pattern: &str, haystack: &str) -> Option<(usize, usize)> { + matcher(pattern) + .find(haystack.as_bytes()) + .unwrap() + .map(|m| (m.start(), m.end())) + } + + fn find_by_caps(pattern: &str, haystack: &str) -> Option<(usize, usize)> { + let m = matcher(pattern); + let mut caps = m.new_captures().unwrap(); + if !m.captures(haystack.as_bytes(), &mut caps).unwrap() { + None + } else { + caps.get(0).map(|m| (m.start(), m.end())) + } + } + + // Test that the standard `find` API reports offsets correctly. + #[test] + fn various_find() { + assert_eq!(Some((0, 3)), find(r"foo", "foo")); + assert_eq!(Some((0, 3)), find(r"foo", "foo(")); + assert_eq!(Some((1, 4)), find(r"foo", "!foo(")); + assert_eq!(None, find(r"foo", "!afoo(")); + + assert_eq!(Some((0, 3)), find(r"foo", "foo☃")); + assert_eq!(None, find(r"foo", "fooб")); + // assert_eq!(Some((0, 3)), find(r"foo", "fooб")); + + // See: https://github.com/BurntSushi/ripgrep/issues/389 + assert_eq!(Some((0, 2)), find(r"-2", "-2")); + } + + // Test that the captures API also reports offsets correctly, just as + // find does. This exercises a different path in the code since captures + // are handled differently. 
+ #[test] + fn various_captures() { + assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo")); + assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo(")); + assert_eq!(Some((1, 4)), find_by_caps(r"foo", "!foo(")); + assert_eq!(None, find_by_caps(r"foo", "!afoo(")); + + assert_eq!(Some((0, 3)), find_by_caps(r"foo", "foo☃")); + assert_eq!(None, find_by_caps(r"foo", "fooб")); + // assert_eq!(Some((0, 3)), find_by_caps(r"foo", "fooб")); + + // See: https://github.com/BurntSushi/ripgrep/issues/389 + assert_eq!(Some((0, 2)), find_by_caps(r"-2", "-2")); + } + + // Test that the capture reporting methods work as advertised. + #[test] + fn capture_indexing() { + let m = matcher(r"(a)(?Pb)(c)"); + assert_eq!(4, m.capture_count()); + assert_eq!(Some(2), m.capture_index("foo")); + + let mut caps = m.new_captures().unwrap(); + assert_eq!(4, caps.len()); + + assert!(m.captures(b"abc", &mut caps).unwrap()); + assert_eq!(caps.get(0), Some(Match::new(0, 3))); + assert_eq!(caps.get(1), Some(Match::new(0, 1))); + assert_eq!(caps.get(2), Some(Match::new(1, 2))); + assert_eq!(caps.get(3), Some(Match::new(2, 3))); + assert_eq!(caps.get(4), None); + + assert!(m.captures(b"#abc#", &mut caps).unwrap()); + assert_eq!(caps.get(0), Some(Match::new(1, 4))); + assert_eq!(caps.get(1), Some(Match::new(1, 2))); + assert_eq!(caps.get(2), Some(Match::new(2, 3))); + assert_eq!(caps.get(3), Some(Match::new(3, 4))); + assert_eq!(caps.get(4), None); + } +} diff --git a/grep-searcher/Cargo.toml b/grep-searcher/Cargo.toml new file mode 100644 index 000000000..9fad2030a --- /dev/null +++ b/grep-searcher/Cargo.toml @@ -0,0 +1,35 @@ +[package] +name = "grep-searcher" +version = "0.0.1" #:version +authors = ["Andrew Gallant "] +description = """ +Fast line oriented regex searching as a library. 
+""" +documentation = "https://docs.rs/grep-searcher" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "egrep", "search", "pattern"] +license = "Unlicense/MIT" + +[dependencies] +bytecount = "0.3.1" +encoding_rs = "0.8" +encoding_rs_io = "0.1.2" +grep-matcher = { version = "0.0.1", path = "../grep-matcher" } +log = "0.4" +memchr = "2" +memmap = "0.6" + +[dev-dependencies] +grep-regex = { version = "0.0.1", path = "../grep-regex" } +regex = "1" + +[features] +avx-accel = [ + "bytecount/avx-accel", +] +simd-accel = [ + "bytecount/simd-accel", + "encoding_rs/simd-accel", +] diff --git a/grep-searcher/LICENSE-MIT b/grep-searcher/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/grep-searcher/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/grep-searcher/README.md b/grep-searcher/README.md new file mode 100644 index 000000000..2cc403cbd --- /dev/null +++ b/grep-searcher/README.md @@ -0,0 +1,37 @@ +grep-searcher +------------- +A high level library for executing fast line oriented searches. This handles +things like reporting contextual lines, counting lines, inverting a search, +detecting binary data, automatic UTF-16 transcoding and deciding whether or not +to use memory maps. + +[![Linux build status](https://api.travis-ci.org/BurntSushi/ripgrep.svg)](https://travis-ci.org/BurntSushi/ripgrep) +[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/ripgrep?svg=true)](https://ci.appveyor.com/project/BurntSushi/ripgrep) +[![](https://img.shields.io/crates/v/grep-searcher.svg)](https://crates.io/crates/grep-searcher) + +Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org). + +### Documentation + +[https://docs.rs/grep-searcher](https://docs.rs/grep-searcher) + +**NOTE:** You probably don't want to use this crate directly. Instead, you +should prefer the facade defined in the +[`grep`](https://docs.rs/grep) +crate. + + +### Usage + +Add this to your `Cargo.toml`: + +```toml +[dependencies] +grep-searcher = "0.1" +``` + +and this to your crate root: + +```rust +extern crate grep_searcher; +``` diff --git a/grep-searcher/UNLICENSE b/grep-searcher/UNLICENSE new file mode 100644 index 000000000..68a49daad --- /dev/null +++ b/grep-searcher/UNLICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. 
We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/grep-searcher/examples/search-stdin.rs b/grep-searcher/examples/search-stdin.rs
new file mode 100644
index 000000000..9d642973f
--- /dev/null
+++ b/grep-searcher/examples/search-stdin.rs
@@ -0,0 +1,33 @@
+extern crate grep_regex;
+extern crate grep_searcher;
+
+use std::env;
+use std::error::Error;
+use std::io;
+use std::process;
+
+use grep_regex::RegexMatcher;
+use grep_searcher::Searcher;
+use grep_searcher::sinks::UTF8;
+
+fn main() {
+    if let Err(err) = example() {
+        eprintln!("{}", err);
+        process::exit(1);
+    }
+}
+
+fn example() -> Result<(), Box<Error>> {
+    let pattern = match env::args().nth(1) {
+        Some(pattern) => pattern,
+        None => return Err(From::from(format!(
+            "Usage: search-stdin <pattern>"
+        ))),
+    };
+    let matcher = RegexMatcher::new(&pattern)?;
+    Searcher::new().search_reader(&matcher, io::stdin(), UTF8(|lnum, line| {
+        print!("{}:{}", lnum, line);
+        Ok(true)
+    }))?;
+    Ok(())
+}
diff --git a/grep-searcher/src/lib.rs b/grep-searcher/src/lib.rs
new file mode 100644
index 000000000..4874b996b
--- /dev/null
+++ b/grep-searcher/src/lib.rs
@@ -0,0 +1,135 @@
+/*!
+This crate provides an implementation of line oriented search, with optional
+support for multi-line search.
+
+# Brief overview
+
+The principal type in this crate is a
+[`Searcher`](struct.Searcher.html),
+which can be configured and built by a
+[`SearcherBuilder`](struct.SearcherBuilder.html).
+A `Searcher` is responsible for reading bytes from a source (e.g., a file),
+executing a search of those bytes using a `Matcher` (e.g., a regex) and then
+reporting the results of that search to a
+[`Sink`](trait.Sink.html)
+(e.g., stdout). The `Searcher` itself is principally responsible for managing
+the consumption of bytes from a source and applying a `Matcher` over those
+bytes in an efficient way. The `Searcher` is also responsible for inverting
+a search, counting lines, reporting contextual lines, detecting binary data
+and even deciding whether or not to use memory maps.
+
+A `Matcher` (which is defined in the
+[`grep-matcher`](https://crates.io/crates/grep-matcher)
+crate) is a trait for describing the lowest levels of pattern search in a
+generic way. The interface itself is very similar to the interface of a regular
+expression. For example, the
+[`grep-regex`](https://crates.io/crates/grep-regex)
+crate provides an implementation of the `Matcher` trait using Rust's
+[`regex`](https://crates.io/crates/regex)
+crate.
+
+Finally, a `Sink` describes how callers receive search results produced by a
+`Searcher`. This includes routines that are called at the beginning and end of
+a search, in addition to routines that are called when matching or contextual
+lines are found by the `Searcher`. Implementations of `Sink` can be trivially
+simple, or extraordinarily complex, such as the
+`Standard` printer found in the
+[`grep-printer`](https://crates.io/crates/grep-printer)
+crate, which effectively implements grep-like output.
+This crate also provides convenience `Sink` implementations in the
+[`sinks`](sinks/index.html)
+sub-module for easy searching with closures.
+
+# Example
+
+This example shows how to execute the searcher and read the search results
+using the
+[`UTF8`](sinks/struct.UTF8.html)
+implementation of `Sink`.
+
+```
+extern crate grep_matcher;
+extern crate grep_regex;
+extern crate grep_searcher;
+
+use std::error::Error;
+
+use grep_matcher::Matcher;
+use grep_regex::RegexMatcher;
+use grep_searcher::Searcher;
+use grep_searcher::sinks::UTF8;
+
+const SHERLOCK: &'static [u8] = b"\
+For the Doctor Watsons of this world, as opposed to the Sherlock
+Holmeses, success in the province of detective work must always
+be, to a very large extent, the result of luck. Sherlock Holmes
+can extract a clew from a wisp of straw or a flake of cigar ash;
+but Doctor Watson has to have it taken out for him and dusted,
+and exhibited clearly, with a label attached.
+";
+
+# fn main() { example().unwrap() }
+fn example() -> Result<(), Box<Error>> {
+    let matcher = RegexMatcher::new(r"Doctor \w+")?;
+    let mut matches: Vec<(u64, String)> = vec![];
+    Searcher::new().search_slice(&matcher, SHERLOCK, UTF8(|lnum, line| {
+        // We are guaranteed to find a match, so the unwrap is OK.
+        eprintln!("LINE: {:?}", line);
+        let mymatch = matcher.find(line.as_bytes())?.unwrap();
+        matches.push((lnum, line[mymatch].to_string()));
+        Ok(true)
+    }))?;
+
+    eprintln!("MATCHES: {:?}", matches);
+
+    assert_eq!(matches.len(), 2);
+    assert_eq!(
+        matches[0],
+        (1, "Doctor Watsons".to_string())
+    );
+    assert_eq!(
+        matches[1],
+        (5, "Doctor Watson".to_string())
+    );
+    Ok(())
+}
+```
+
+See also `examples/search-stdin.rs` from the root of this crate's directory
+to see a similar example that accepts a pattern on the command line and
+searches stdin.
+*/ + +#![deny(missing_docs)] + +extern crate bytecount; +extern crate encoding_rs; +extern crate encoding_rs_io; +extern crate grep_matcher; +#[macro_use] +extern crate log; +extern crate memchr; +extern crate memmap; +#[cfg(test)] +extern crate regex; + +pub use lines::{LineIter, LineStep}; +pub use searcher::{ + BinaryDetection, ConfigError, Encoding, MmapChoice, + Searcher, SearcherBuilder, +}; +pub use sink::{ + Sink, SinkError, + SinkContext, SinkContextKind, SinkFinish, SinkMatch, +}; +pub use sink::sinks; + +#[macro_use] +mod macros; + +mod line_buffer; +mod lines; +mod searcher; +mod sink; +#[cfg(test)] +mod testutil; diff --git a/grep-searcher/src/line_buffer.rs b/grep-searcher/src/line_buffer.rs new file mode 100644 index 000000000..0f5a2a7a7 --- /dev/null +++ b/grep-searcher/src/line_buffer.rs @@ -0,0 +1,968 @@ +use std::cmp; +use std::io; +use std::ptr; + +use memchr::{memchr, memrchr}; + +/// The default buffer capacity that we use for the line buffer. +pub(crate) const DEFAULT_BUFFER_CAPACITY: usize = 8 * (1<<10); // 8 KB + +/// The behavior of a searcher in the face of long lines and big contexts. +/// +/// When searching data incrementally using a fixed size buffer, this controls +/// the amount of *additional* memory to allocate beyond the size of the buffer +/// to accommodate lines (which may include the lines in a context window, when +/// enabled) that do not fit in the buffer. +/// +/// The default is to eagerly allocate without a limit. +#[derive(Clone, Copy, Debug)] +pub enum BufferAllocation { + /// Attempt to expand the size of the buffer until either at least the next + /// line fits into memory or until all available memory is exhausted. + /// + /// This is the default. + Eager, + /// Limit the amount of additional memory allocated to the given size. If + /// a line is found that requires more memory than is allowed here, then + /// stop reading and return an error. 
+ Error(usize), +} + +impl Default for BufferAllocation { + fn default() -> BufferAllocation { + BufferAllocation::Eager + } +} + +/// Create a new error to be used when a configured allocation limit has been +/// reached. +pub fn alloc_error(limit: usize) -> io::Error { + let msg = format!("configured allocation limit ({}) exceeded", limit); + io::Error::new(io::ErrorKind::Other, msg) +} + +/// The behavior of binary detection in the line buffer. +/// +/// Binary detection is the process of _heuristically_ identifying whether a +/// given chunk of data is binary or not, and then taking an action based on +/// the result of that heuristic. The motivation behind detecting binary data +/// is that binary data often indicates data that is undesirable to search +/// using textual patterns. Of course, there are many cases in which this isn't +/// true, which is why binary detection is disabled by default. +#[derive(Clone, Copy, Debug)] +pub enum BinaryDetection { + /// No binary detection is performed. Data reported by the line buffer may + /// contain arbitrary bytes. + None, + /// The given byte is searched in all contents read by the line buffer. If + /// it occurs, then the data is considered binary and the line buffer acts + /// as if it reached EOF. The line buffer guarantees that this byte will + /// never be observable by callers. + Quit(u8), + /// The given byte is searched in all contents read by the line buffer. If + /// it occurs, then it is replaced by the line terminator. The line buffer + /// guarantees that this byte will never be observable by callers. + Convert(u8), +} + +impl Default for BinaryDetection { + fn default() -> BinaryDetection { + BinaryDetection::None + } +} + +impl BinaryDetection { + /// Returns true if and only if the detection heuristic demands that + /// the line buffer stop read data once binary data is observed. 
+ fn is_quit(&self) -> bool { + match *self { + BinaryDetection::Quit(_) => true, + _ => false, + } + } +} + +/// The configuration of a buffer. This contains options that are fixed once +/// a buffer has been constructed. +#[derive(Clone, Copy, Debug)] +struct Config { + /// The number of bytes to attempt to read at a time. + capacity: usize, + /// The line terminator. + lineterm: u8, + /// The behavior for handling long lines. + buffer_alloc: BufferAllocation, + /// When set, the presence of the given byte indicates binary content. + binary: BinaryDetection, +} + +impl Default for Config { + fn default() -> Config { + Config { + capacity: DEFAULT_BUFFER_CAPACITY, + lineterm: b'\n', + buffer_alloc: BufferAllocation::default(), + binary: BinaryDetection::default(), + } + } +} + +/// A builder for constructing line buffers. +#[derive(Clone, Debug, Default)] +pub struct LineBufferBuilder { + config: Config, +} + +impl LineBufferBuilder { + /// Create a new builder for a buffer. + pub fn new() -> LineBufferBuilder { + LineBufferBuilder { config: Config::default() } + } + + /// Create a new line buffer from this builder's configuration. + pub fn build(&self) -> LineBuffer { + LineBuffer { + config: self.config, + buf: vec![0; self.config.capacity], + pos: 0, + last_lineterm: 0, + end: 0, + absolute_byte_offset: 0, + binary_byte_offset: None, + } + } + + /// Set the default capacity to use for a buffer. + /// + /// In general, the capacity of a buffer corresponds to the amount of data + /// to hold in memory, and the size of the reads to make to the underlying + /// reader. + /// + /// This is set to a reasonable default and probably shouldn't be changed + /// unless there's a specific reason to do so. + pub fn capacity(&mut self, capacity: usize) -> &mut LineBufferBuilder { + self.config.capacity = capacity; + self + } + + /// Set the line terminator for the buffer. 
+ /// + /// Every buffer has a line terminator, and this line terminator is used + /// to determine how to roll the buffer forward. For example, when a read + /// to the buffer's underlying reader occurs, the end of the data that is + /// read is likely to correspond to an incomplete line. As a line buffer, + /// callers should not access this data since it is incomplete. The line + /// terminator is how the line buffer determines the part of the read that + /// is incomplete. + /// + /// By default, this is set to `b'\n'`. + pub fn line_terminator(&mut self, lineterm: u8) -> &mut LineBufferBuilder { + self.config.lineterm = lineterm; + self + } + + /// Set the maximum amount of additional memory to allocate for long lines. + /// + /// In order to enable line oriented search, a fundamental requirement is + /// that, at a minimum, each line must be able to fit into memory. This + /// setting controls how big that line is allowed to be. By default, this + /// is set to `BufferAllocation::Eager`, which means a line buffer will + /// attempt to allocate as much memory as possible to fit a line, and will + /// only be limited by available memory. + /// + /// Note that this setting only applies to the amount of *additional* + /// memory to allocate, beyond the capacity of the buffer. That means that + /// a value of `0` is sensible, and in particular, will guarantee that a + /// line buffer will never allocate additional memory beyond its initial + /// capacity. + pub fn buffer_alloc( + &mut self, + behavior: BufferAllocation, + ) -> &mut LineBufferBuilder { + self.config.buffer_alloc = behavior; + self + } + + /// Whether to enable binary detection or not. Depending on the setting, + /// this can either cause the line buffer to report EOF early or it can + /// cause the line buffer to clean the data. + /// + /// By default, this is disabled. In general, binary detection should be + /// viewed as an imperfect heuristic. 
+    pub fn binary_detection(
+        &mut self,
+        detection: BinaryDetection,
+    ) -> &mut LineBufferBuilder {
+        self.config.binary = detection;
+        self
+    }
+}
+
+/// A line buffer reader efficiently reads a line oriented buffer from an
+/// arbitrary reader.
+#[derive(Debug)]
+pub struct LineBufferReader<'b, R> {
+    rdr: R,
+    line_buffer: &'b mut LineBuffer,
+}
+
+impl<'b, R: io::Read> LineBufferReader<'b, R> {
+    /// Create a new buffered reader that reads from `rdr` and uses the given
+    /// `line_buffer` as an intermediate buffer.
+    ///
+    /// This does not change the binary detection behavior of the given line
+    /// buffer.
+    pub fn new(
+        rdr: R,
+        line_buffer: &'b mut LineBuffer,
+    ) -> LineBufferReader<'b, R> {
+        line_buffer.clear();
+        LineBufferReader { rdr, line_buffer }
+    }
+
+    /// The absolute byte offset which corresponds to the starting offsets
+    /// of the data returned by `buffer` relative to the beginning of the
+    /// underlying reader's contents. As such, this offset does not generally
+    /// correspond to an offset in memory. It is typically used for reporting
+    /// purposes. It can also be used for counting the number of bytes that
+    /// have been searched.
+    pub fn absolute_byte_offset(&self) -> u64 {
+        self.line_buffer.absolute_byte_offset()
+    }
+
+    /// If binary data was detected, then this returns the absolute byte offset
+    /// at which binary data was initially found.
+    pub fn binary_byte_offset(&self) -> Option<u64> {
+        self.line_buffer.binary_byte_offset()
+    }
+
+    /// Fill the contents of this buffer by discarding the part of the buffer
+    /// that has been consumed. The free space created by discarding the
+    /// consumed part of the buffer is then filled with new data from the
+    /// reader.
+    ///
+    /// If EOF is reached, then `false` is returned. Otherwise, `true` is
+    /// returned. (Note that if this line buffer's binary detection is set to
+    /// `Quit`, then the presence of binary data will cause this buffer to
+    /// behave as if it had seen EOF at the first occurrence of binary data.)
+    ///
+    /// This forwards any errors returned by the underlying reader, and will
+    /// also return an error if the buffer must be expanded past its allocation
+    /// limit, as governed by the buffer allocation strategy.
+    pub fn fill(&mut self) -> Result<bool, io::Error> {
+        self.line_buffer.fill(&mut self.rdr)
+    }
+
+    /// Return the contents of this buffer.
+    pub fn buffer(&self) -> &[u8] {
+        self.line_buffer.buffer()
+    }
+
+    /// Consume the number of bytes provided. This must be less than or equal
+    /// to the number of bytes returned by `buffer`.
+    pub fn consume(&mut self, amt: usize) {
+        self.line_buffer.consume(amt);
+    }
+
+    /// Consumes the remainder of the buffer. Subsequent calls to `buffer` are
+    /// guaranteed to return an empty slice until the buffer is refilled.
+    ///
+    /// This is a convenience function for `consume(buffer.len())`.
+    #[cfg(test)]
+    fn consume_all(&mut self) {
+        self.line_buffer.consume_all();
+    }
+}
+
+/// A line buffer manages a (typically fixed) buffer for holding lines.
+///
+/// Callers should create line buffers sparingly and reuse them when possible.
+/// Line buffers cannot be used directly, but instead must be used via the
+/// LineBufferReader.
+#[derive(Clone, Debug)]
+pub struct LineBuffer {
+    /// The configuration of this buffer.
+    config: Config,
+    /// The primary buffer with which to hold data.
+    buf: Vec<u8>,
+    /// The current position of this buffer. This is always a valid sliceable
+    /// index into `buf`, and its maximum value is the length of `buf`.
+    pos: usize,
+    /// The end position of searchable content in this buffer. This is either
+    /// set to just after the final line terminator in the buffer, or to just
+    /// after the end of the last byte emitted by the reader when the reader
+    /// has been exhausted.
+    last_lineterm: usize,
+    /// The end position of the buffer. This is always greater than or equal to
+    /// last_lineterm. The bytes between last_lineterm and end, if any, always
+    /// correspond to a partial line.
+    end: usize,
+    /// The absolute byte offset corresponding to `pos`. This is most typically
+    /// not a valid index into addressable memory, but rather, an offset that
+    /// is relative to all data that passes through a line buffer (since
+    /// construction or since the last time `clear` was called).
+    ///
+    /// When the line buffer reaches EOF, this is set to the position just
+    /// after the last byte read from the underlying reader. That is, it
+    /// becomes the total count of bytes that have been read.
+    absolute_byte_offset: u64,
+    /// If binary data was found, this records the absolute byte offset at
+    /// which it was first detected.
+    binary_byte_offset: Option<u64>,
+}
+
+impl LineBuffer {
+    /// Reset this buffer, such that it can be used with a new reader.
+    fn clear(&mut self) {
+        self.pos = 0;
+        self.last_lineterm = 0;
+        self.end = 0;
+        self.absolute_byte_offset = 0;
+        self.binary_byte_offset = None;
+    }
+
+    /// The absolute byte offset which corresponds to the starting offsets
+    /// of the data returned by `buffer` relative to the beginning of the
+    /// reader's contents. As such, this offset does not generally correspond
+    /// to an offset in memory. It is typically used for reporting purposes,
+    /// particularly in error messages.
+    ///
+    /// This is reset to `0` when `clear` is called.
+    fn absolute_byte_offset(&self) -> u64 {
+        self.absolute_byte_offset
+    }
+
+    /// If binary data was detected, then this returns the absolute byte offset
+    /// at which binary data was initially found.
+    fn binary_byte_offset(&self) -> Option<u64> {
+        self.binary_byte_offset
+    }
+
+    /// Return the contents of this buffer.
+    fn buffer(&self) -> &[u8] {
+        &self.buf[self.pos..self.last_lineterm]
+    }
+
+    /// Return the contents of the free space beyond the end of the buffer as
+    /// a mutable slice.
+    fn free_buffer(&mut self) -> &mut [u8] {
+        &mut self.buf[self.end..]
+    }
+
+    /// Consume the number of bytes provided. This must be less than or equal
+    /// to the number of bytes returned by `buffer`.
+    fn consume(&mut self, amt: usize) {
+        assert!(amt <= self.buffer().len());
+        self.pos += amt;
+        self.absolute_byte_offset += amt as u64;
+    }
+
+    /// Consumes the remainder of the buffer. Subsequent calls to `buffer` are
+    /// guaranteed to return an empty slice until the buffer is refilled.
+    ///
+    /// This is a convenience function for `consume(buffer.len())`.
+    #[cfg(test)]
+    fn consume_all(&mut self) {
+        let amt = self.buffer().len();
+        self.consume(amt);
+    }
+
+    /// Fill the contents of this buffer by discarding the part of the buffer
+    /// that has been consumed. The free space created by discarding the
+    /// consumed part of the buffer is then filled with new data from the given
+    /// reader.
+    ///
+    /// Callers should provide the same reader to this line buffer in
+    /// subsequent calls to fill. A different reader can only be used
+    /// immediately following a call to `clear`.
+    ///
+    /// If EOF is reached, then `false` is returned. Otherwise, `true` is
+    /// returned. (Note that if this line buffer's binary detection is set to
+    /// `Quit`, then the presence of binary data will cause this buffer to
+    /// behave as if it had seen EOF.)
+    ///
+    /// This forwards any errors returned by `rdr`, and will also return an
+    /// error if the buffer must be expanded past its allocation limit, as
+    /// governed by the buffer allocation strategy.
+    fn fill<R: io::Read>(&mut self, mut rdr: R) -> Result<bool, io::Error> {
+        // If the binary detection heuristic tells us to quit once binary data
+        // has been observed, then we no longer read new data and reach EOF
+        // once the current buffer has been consumed.
+ if self.config.binary.is_quit() && self.binary_byte_offset.is_some() { + return Ok(!self.buffer().is_empty()); + } + + self.roll(); + assert_eq!(self.pos, 0); + loop { + self.ensure_capacity()?; + let readlen = rdr.read(self.free_buffer())?; + if readlen == 0 { + // We're only done reading for good once the caller has + // consumed everything. + self.last_lineterm = self.end; + return Ok(!self.buffer().is_empty()); + } + + // Get a mutable view into the bytes we've just read. These are + // the bytes that we do binary detection on, and also the bytes we + // search to find the last line terminator. We need a mutable slice + // in the case of binary conversion. + let oldend = self.end; + self.end += readlen; + let newbytes = &mut self.buf[oldend..self.end]; + + // Binary detection. + match self.config.binary { + BinaryDetection::None => {} // nothing to do + BinaryDetection::Quit(byte) => { + if let Some(i) = memchr(byte, newbytes) { + self.end = oldend + i; + self.last_lineterm = self.end; + self.binary_byte_offset = + Some(self.absolute_byte_offset + self.end as u64); + // If the first byte in our buffer is a binary byte, + // then our buffer is empty and we should report as + // such to the caller. + return Ok(self.pos < self.end); + } + } + BinaryDetection::Convert(byte) => { + if let Some(i) = replace_bytes( + newbytes, + byte, + self.config.lineterm, + ) { + // Record only the first binary offset. + if self.binary_byte_offset.is_none() { + self.binary_byte_offset = + Some(self.absolute_byte_offset + + (oldend + i) as u64); + } + } + } + } + + // Update our `last_lineterm` positions if we read one. + if let Some(i) = memrchr(self.config.lineterm, newbytes) { + self.last_lineterm = oldend + i + 1; + return Ok(true); + } + // At this point, if we couldn't find a line terminator, then we + // don't have a complete line. Therefore, we try to read more! + } + } + + /// Roll the unconsumed parts of the buffer to the front. + /// + /// This operation is idempotent. 
+ /// + /// After rolling, `last_lineterm` and `end` point to the same location, + /// and `pos` is always set to `0`. + fn roll(&mut self) { + if self.pos == self.end { + self.pos = 0; + self.last_lineterm = 0; + self.end = 0; + return; + } + + assert!(self.pos < self.end && self.end <= self.buf.len()); + let roll_len = self.end - self.pos; + unsafe { + // SAFETY: A buffer contains Copy data, so there's no problem + // moving it around. Safety also depends on our indices being + // in bounds, which they should always be, and we enforce with + // an assert above. + // + // It seems like it should be possible to do this in safe code that + // results in the same codegen. I tried the obvious: + // + // for (src, dst) in (self.pos..self.end).zip(0..) { + // self.buf[dst] = self.buf[src]; + // } + // + // But the above does not work, and in fact compiles down to a slow + // byte-by-byte loop. I tried a few other minor variations, but + // alas, better minds might prevail. + // + // Overall, this doesn't save us *too* much. It mostly matters when + // the number of bytes we're copying is large, which can happen + // if the searcher is asked to produce a lot of context. We could + // decide this isn't worth it, but it does make an appreciable + // impact at or around the context=30 range on my machine. + // + // We could also use a temporary buffer that compiles down to two + // memcpys and is faster than the byte-at-a-time loop, but it + // complicates our options for limiting memory allocation a bit. + ptr::copy( + self.buf[self.pos..].as_ptr(), + self.buf.as_mut_ptr(), + roll_len, + ); + } + self.pos = 0; + self.last_lineterm = roll_len; + self.end = roll_len; + } + + /// Ensures that the internal buffer has a non-zero amount of free space + /// in which to read more data. If there is no free space, then more is + /// allocated. If the allocation must exceed the configured limit, then + /// this returns an error. 
+    fn ensure_capacity(&mut self) -> Result<(), io::Error> {
+        if !self.free_buffer().is_empty() {
+            return Ok(());
+        }
+        // `len` is used for computing the next allocation size. The capacity
+        // is permitted to start at `0`, so we make sure it's at least `1`.
+        let len = cmp::max(1, self.buf.len());
+        let additional = match self.config.buffer_alloc {
+            BufferAllocation::Eager => len * 2,
+            BufferAllocation::Error(limit) => {
+                let used = self.buf.len() - self.config.capacity;
+                let n = cmp::min(len * 2, limit - used);
+                if n == 0 {
+                    return Err(alloc_error(self.config.capacity + limit));
+                }
+                n
+            }
+        };
+        assert!(additional > 0);
+        let newlen = self.buf.len() + additional;
+        self.buf.resize(newlen, 0);
+        assert!(!self.free_buffer().is_empty());
+        Ok(())
+    }
+}
+
+/// Replaces `src` with `replacement` in bytes.
+fn replace_bytes(bytes: &mut [u8], src: u8, replacement: u8) -> Option<usize> {
+    if src == replacement {
+        return None;
+    }
+    let mut first_pos = None;
+    let mut pos = 0;
+    while let Some(i) = memchr(src, &bytes[pos..]).map(|i| pos + i) {
+        if first_pos.is_none() {
+            first_pos = Some(i);
+        }
+        bytes[i] = replacement;
+        pos = i + 1;
+        while bytes.get(pos) == Some(&src) {
+            bytes[pos] = replacement;
+            pos += 1;
+        }
+    }
+    first_pos
+}
+
+#[cfg(test)]
+mod tests {
+    use std::str;
+    use super::*;
+
+    const SHERLOCK: &'static str = "\
+For the Doctor Watsons of this world, as opposed to the Sherlock
+Holmeses, success in the province of detective work must always
+be, to a very large extent, the result of luck. Sherlock Holmes
+can extract a clew from a wisp of straw or a flake of cigar ash;
+but Doctor Watson has to have it taken out for him and dusted,
+and exhibited clearly, with a label attached.\
+";
+
+    fn s(slice: &str) -> String {
+        slice.to_string()
+    }
+
+    fn btos(slice: &[u8]) -> &str {
+        str::from_utf8(slice).unwrap()
+    }
+
+    fn replace_str(
+        slice: &str,
+        src: u8,
+        replacement: u8,
+    ) -> (String, Option<usize>) {
+        let mut dst = slice.to_string().into_bytes();
+        let result = replace_bytes(&mut dst, src, replacement);
+        (String::from_utf8(dst).unwrap(), result)
+    }
+
+    #[test]
+    fn replace() {
+        assert_eq!(replace_str("abc", b'b', b'z'), (s("azc"), Some(1)));
+        assert_eq!(replace_str("abb", b'b', b'z'), (s("azz"), Some(1)));
+        assert_eq!(replace_str("aba", b'a', b'z'), (s("zbz"), Some(0)));
+        assert_eq!(replace_str("bbb", b'b', b'z'), (s("zzz"), Some(0)));
+        assert_eq!(replace_str("bac", b'b', b'z'), (s("zac"), Some(0)));
+    }
+
+    #[test]
+    fn buffer_basics1() {
+        let bytes = "homer\nlisa\nmaggie";
+        let mut linebuf = LineBufferBuilder::new().build();
+        let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
+
+        assert!(rdr.buffer().is_empty());
+
+        assert!(rdr.fill().unwrap());
+        assert_eq!(btos(rdr.buffer()), "homer\nlisa\n");
+        assert_eq!(rdr.absolute_byte_offset(), 0);
+        rdr.consume(5);
+        assert_eq!(rdr.absolute_byte_offset(), 5);
+        rdr.consume_all();
+        assert_eq!(rdr.absolute_byte_offset(), 11);
+
+        assert!(rdr.fill().unwrap());
+        assert_eq!(btos(rdr.buffer()), "maggie");
+        rdr.consume_all();
+
+        assert!(!rdr.fill().unwrap());
+        assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64);
+        assert_eq!(rdr.binary_byte_offset(), None);
+    }
+
+    #[test]
+    fn buffer_basics2() {
+        let bytes = "homer\nlisa\nmaggie\n";
+        let mut linebuf = LineBufferBuilder::new().build();
+        let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf);
+
+        assert!(rdr.fill().unwrap());
+        assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n");
+        rdr.consume_all();
+
+ assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), None); + } + + #[test] + fn buffer_basics3() { + let bytes = "\n"; + let mut linebuf = LineBufferBuilder::new().build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), None); + } + + #[test] + fn buffer_basics4() { + let bytes = "\n\n"; + let mut linebuf = LineBufferBuilder::new().build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "\n\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), None); + } + + #[test] + fn buffer_empty() { + let bytes = ""; + let mut linebuf = LineBufferBuilder::new().build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), None); + } + + #[test] + fn buffer_zero_capacity() { + let bytes = "homer\nlisa\nmaggie"; + let mut linebuf = LineBufferBuilder::new().capacity(0).build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + while rdr.fill().unwrap() { + rdr.consume_all(); + } + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), None); + } + + #[test] + fn buffer_small_capacity() { + let bytes = "homer\nlisa\nmaggie"; + let mut linebuf = LineBufferBuilder::new().capacity(1).build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + let mut got = vec![]; + while rdr.fill().unwrap() { + 
got.extend(rdr.buffer()); + rdr.consume_all(); + } + assert_eq!(bytes, btos(&got)); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), None); + } + + #[test] + fn buffer_limited_capacity1() { + let bytes = "homer\nlisa\nmaggie"; + let mut linebuf = LineBufferBuilder::new() + .capacity(1) + .buffer_alloc(BufferAllocation::Error(5)) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\n"); + rdr.consume_all(); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "lisa\n"); + rdr.consume_all(); + + // This returns an error because while we have just enough room to + // store maggie in the buffer, we *don't* have enough room to read one + // more byte, so we don't know whether we're at EOF or not, and + // therefore must give up. + assert!(rdr.fill().is_err()); + + // We can mush on though! + assert_eq!(btos(rdr.buffer()), "m"); + rdr.consume_all(); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "aggie"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + } + + #[test] + fn buffer_limited_capacity2() { + let bytes = "homer\nlisa\nmaggie"; + let mut linebuf = LineBufferBuilder::new() + .capacity(1) + .buffer_alloc(BufferAllocation::Error(6)) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\n"); + rdr.consume_all(); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "lisa\n"); + rdr.consume_all(); + + // We have just enough space. 
+ assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "maggie"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + } + + #[test] + fn buffer_limited_capacity3() { + let bytes = "homer\nlisa\nmaggie"; + let mut linebuf = LineBufferBuilder::new() + .capacity(1) + .buffer_alloc(BufferAllocation::Error(0)) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.fill().is_err()); + assert_eq!(btos(rdr.buffer()), ""); + } + + #[test] + fn buffer_binary_none() { + let bytes = "homer\nli\x00sa\nmaggie\n"; + let mut linebuf = LineBufferBuilder::new().build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\nli\x00sa\nmaggie\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), None); + } + + #[test] + fn buffer_binary_quit1() { + let bytes = "homer\nli\x00sa\nmaggie\n"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Quit(b'\x00')) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\nli"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), 8); + assert_eq!(rdr.binary_byte_offset(), Some(8)); + } + + #[test] + fn buffer_binary_quit2() { + let bytes = "\x00homer\nlisa\nmaggie\n"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Quit(b'\x00')) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(!rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), ""); + assert_eq!(rdr.absolute_byte_offset(), 0); + assert_eq!(rdr.binary_byte_offset(), Some(0)); + } + + #[test] + fn 
buffer_binary_quit3() { + let bytes = "homer\nlisa\nmaggie\n\x00"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Quit(b'\x00')) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 1); + assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 1)); + } + + #[test] + fn buffer_binary_quit4() { + let bytes = "homer\nlisa\nmaggie\x00\n"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Quit(b'\x00')) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64 - 2); + assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 2)); + } + + #[test] + fn buffer_binary_quit5() { + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Quit(b'u')) + .build(); + let mut rdr = LineBufferReader::new(SHERLOCK.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, s\ +"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), 76); + assert_eq!(rdr.binary_byte_offset(), Some(76)); + assert_eq!(SHERLOCK.as_bytes()[76], b'u'); + } + + #[test] + fn buffer_binary_convert1() { + let bytes = "homer\nli\x00sa\nmaggie\n"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Convert(b'\x00')) + .build(); + let mut rdr = 
LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\nli\nsa\nmaggie\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), Some(8)); + } + + #[test] + fn buffer_binary_convert2() { + let bytes = "\x00homer\nlisa\nmaggie\n"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Convert(b'\x00')) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "\nhomer\nlisa\nmaggie\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), Some(0)); + } + + #[test] + fn buffer_binary_convert3() { + let bytes = "homer\nlisa\nmaggie\n\x00"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Convert(b'\x00')) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 1)); + } + + #[test] + fn buffer_binary_convert4() { + let bytes = "homer\nlisa\nmaggie\x00\n"; + let mut linebuf = LineBufferBuilder::new() + .binary_detection(BinaryDetection::Convert(b'\x00')) + .build(); + let mut rdr = LineBufferReader::new(bytes.as_bytes(), &mut linebuf); + + assert!(rdr.buffer().is_empty()); + + assert!(rdr.fill().unwrap()); + assert_eq!(btos(rdr.buffer()), "homer\nlisa\nmaggie\n\n"); + rdr.consume_all(); + + assert!(!rdr.fill().unwrap()); + 
assert_eq!(rdr.absolute_byte_offset(), bytes.len() as u64); + assert_eq!(rdr.binary_byte_offset(), Some(bytes.len() as u64 - 2)); + } +} diff --git a/grep-searcher/src/lines.rs b/grep-searcher/src/lines.rs new file mode 100644 index 000000000..ed225a429 --- /dev/null +++ b/grep-searcher/src/lines.rs @@ -0,0 +1,462 @@ +/*! +A collection of routines for performing operations on lines. +*/ + +use bytecount; +use memchr::{memchr, memrchr}; +use grep_matcher::{LineTerminator, Match}; + +/// An iterator over lines in a particular slice of bytes. +/// +/// Line terminators are considered part of the line they terminate. All lines +/// yielded by the iterator are guaranteed to be non-empty. +/// +/// `'b` refers to the lifetime of the underlying bytes. +#[derive(Debug)] +pub struct LineIter<'b> { + bytes: &'b [u8], + stepper: LineStep, +} + +impl<'b> LineIter<'b> { + /// Create a new line iterator that yields lines in the given bytes that + /// are terminated by `line_term`. + pub fn new(line_term: u8, bytes: &'b [u8]) -> LineIter<'b> { + LineIter { + bytes: bytes, + stepper: LineStep::new(line_term, 0, bytes.len()), + } + } +} + +impl<'b> Iterator for LineIter<'b> { + type Item = &'b [u8]; + + fn next(&mut self) -> Option<&'b [u8]> { + self.stepper.next_match(self.bytes).map(|m| &self.bytes[m]) + } +} + +/// An explicit iterator over lines in a particular slice of bytes. +/// +/// This iterator avoids borrowing the bytes themselves, and instead requires +/// callers to explicitly provide the bytes when moving through the iterator. +/// While not idiomatic, this provides a simple way of iterating over lines +/// that doesn't require borrowing the slice itself, which can be convenient. +/// +/// Line terminators are considered part of the line they terminate. All lines +/// yielded by the iterator are guaranteed to be non-empty. 
+#[derive(Debug)] +pub struct LineStep { + line_term: u8, + pos: usize, + end: usize, +} + +impl LineStep { + /// Create a new line iterator over the given range of bytes using the + /// given line terminator. + /// + /// Callers should provide the actual bytes for each call to `next`. The + /// same slice must be provided to each call. + /// + /// This panics if `start` is not less than or equal to `end`. + pub fn new(line_term: u8, start: usize, end: usize) -> LineStep { + LineStep { line_term, pos: start, end: end } + } + + /// Return the start and end position of the next line in the given bytes. + /// + /// The caller must pass exactly the same slice of bytes for each call to + /// `next`. + /// + /// The range returned includes the line terminator. Ranges are always + /// non-empty. + pub fn next(&mut self, bytes: &[u8]) -> Option<(usize, usize)> { + self.next_impl(bytes) + } + + /// Like next, but returns a `Match` instead of a tuple. + #[inline(always)] + pub(crate) fn next_match(&mut self, bytes: &[u8]) -> Option { + self.next_impl(bytes).map(|(s, e)| Match::new(s, e)) + } + + #[inline(always)] + fn next_impl(&mut self, mut bytes: &[u8]) -> Option<(usize, usize)> { + bytes = &bytes[..self.end]; + match memchr(self.line_term, &bytes[self.pos..]) { + None => { + if self.pos < bytes.len() { + let m = (self.pos, bytes.len()); + assert!(m.0 <= m.1); + + self.pos = m.1; + Some(m) + } else { + None + } + } + Some(line_end) => { + let m = (self.pos, self.pos + line_end + 1); + assert!(m.0 <= m.1); + + self.pos = m.1; + Some(m) + } + } + } +} + +/// Count the number of occurrences of `line_term` in `bytes`. +pub fn count(bytes: &[u8], line_term: u8) -> u64 { + bytecount::count(bytes, line_term) as u64 +} + +/// Given a line that possibly ends with a terminator, return that line without +/// the terminator.
+#[inline(always)] +pub fn without_terminator(bytes: &[u8], line_term: LineTerminator) -> &[u8] { + let line_term = line_term.as_bytes(); + let start = bytes.len().saturating_sub(line_term.len()); + if bytes.get(start..) == Some(line_term) { + return &bytes[..bytes.len() - line_term.len()]; + } + bytes +} + +/// Return the start and end offsets of the lines containing the given range +/// of bytes. +/// +/// Line terminators are considered part of the line they terminate. +#[inline(always)] +pub fn locate( + bytes: &[u8], + line_term: u8, + range: Match, +) -> Match { + let line_start = memrchr(line_term, &bytes[0..range.start()]) + .map_or(0, |i| i + 1); + let line_end = + if range.end() > line_start && bytes[range.end() - 1] == line_term { + range.end() + } else { + memchr(line_term, &bytes[range.end()..]) + .map_or(bytes.len(), |i| range.end() + i + 1) + }; + Match::new(line_start, line_end) +} + +/// Returns the minimal starting offset of the line that occurs `count` lines +/// before the last line in `bytes`. +/// +/// Lines are terminated by `line_term`. If `count` is zero, then this returns +/// the starting offset of the last line in `bytes`. +/// +/// If `bytes` ends with a line terminator, then the terminator itself is +/// considered part of the last line. +pub fn preceding(bytes: &[u8], line_term: u8, count: usize) -> usize { + preceding_by_pos(bytes, bytes.len(), line_term, count) +} + +/// Returns the minimal starting offset of the line that occurs `count` lines +/// before the line containing `pos`. Lines are terminated by `line_term`. +/// If `count` is zero, then this returns the starting offset of the line +/// containing `pos`. +/// +/// If `pos` points just past a line terminator, then it is considered part of +/// the line that it terminates. For example, given `bytes = b"abc\nxyz\n"` +/// and `pos = 7`, `preceding(bytes, pos, b'\n', 0)` returns `4` (as does `pos +/// = 8`) and `preceding(bytes, pos, b'\n', 1)` returns `0`.
+fn preceding_by_pos( + bytes: &[u8], + mut pos: usize, + line_term: u8, + mut count: usize, +) -> usize { + if pos == 0 { + return 0; + } else if bytes[pos - 1] == line_term { + pos -= 1; + } + loop { + match memrchr(line_term, &bytes[..pos]) { + None => { + return 0; + } + Some(i) => { + if count == 0 { + return i + 1; + } else if i == 0 { + return 0; + } + count -= 1; + pos = i; + } + } + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + use std::str; + use grep_matcher::Match; + use super::*; + + const SHERLOCK: &'static str = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached.\ +"; + + fn m(start: usize, end: usize) -> Match { + Match::new(start, end) + } + + fn lines(text: &str) -> Vec<&str> { + let mut results = vec![]; + let mut it = LineStep::new(b'\n', 0, text.len()); + while let Some(m) = it.next_match(text.as_bytes()) { + results.push(&text[m]); + } + results + } + + fn line_ranges(text: &str) -> Vec> { + let mut results = vec![]; + let mut it = LineStep::new(b'\n', 0, text.len()); + while let Some(m) = it.next_match(text.as_bytes()) { + results.push(m.start()..m.end()); + } + results + } + + fn prev(text: &str, pos: usize, count: usize) -> usize { + preceding_by_pos(text.as_bytes(), pos, b'\n', count) + } + + fn loc(text: &str, start: usize, end: usize) -> Match { + locate(text.as_bytes(), b'\n', Match::new(start, end)) + } + + #[test] + fn line_count() { + assert_eq!(0, count(b"", b'\n')); + assert_eq!(1, count(b"\n", b'\n')); + assert_eq!(2, count(b"\n\n", b'\n')); + assert_eq!(2, count(b"a\nb\nc", b'\n')); + } + + #[test] + fn line_locate() { + let t = SHERLOCK; + let lines = line_ranges(t); + + assert_eq!( + loc(t, 
lines[0].start, lines[0].end), + m(lines[0].start, lines[0].end)); + assert_eq!( + loc(t, lines[0].start + 1, lines[0].end), + m(lines[0].start, lines[0].end)); + assert_eq!( + loc(t, lines[0].end - 1, lines[0].end), + m(lines[0].start, lines[0].end)); + assert_eq!( + loc(t, lines[0].end, lines[0].end), + m(lines[1].start, lines[1].end)); + + assert_eq!( + loc(t, lines[5].start, lines[5].end), + m(lines[5].start, lines[5].end)); + assert_eq!( + loc(t, lines[5].start + 1, lines[5].end), + m(lines[5].start, lines[5].end)); + assert_eq!( + loc(t, lines[5].end - 1, lines[5].end), + m(lines[5].start, lines[5].end)); + assert_eq!( + loc(t, lines[5].end, lines[5].end), + m(lines[5].start, lines[5].end)); + } + + #[test] + fn line_locate_weird() { + assert_eq!(loc("", 0, 0), m(0, 0)); + + assert_eq!(loc("\n", 0, 1), m(0, 1)); + assert_eq!(loc("\n", 1, 1), m(1, 1)); + + assert_eq!(loc("\n\n", 0, 0), m(0, 1)); + assert_eq!(loc("\n\n", 0, 1), m(0, 1)); + assert_eq!(loc("\n\n", 1, 1), m(1, 2)); + assert_eq!(loc("\n\n", 1, 2), m(1, 2)); + assert_eq!(loc("\n\n", 2, 2), m(2, 2)); + + assert_eq!(loc("a\nb\nc", 0, 1), m(0, 2)); + assert_eq!(loc("a\nb\nc", 1, 2), m(0, 2)); + assert_eq!(loc("a\nb\nc", 2, 3), m(2, 4)); + assert_eq!(loc("a\nb\nc", 3, 4), m(2, 4)); + assert_eq!(loc("a\nb\nc", 4, 5), m(4, 5)); + assert_eq!(loc("a\nb\nc", 5, 5), m(4, 5)); + } + + #[test] + fn line_iter() { + assert_eq!(lines("abc"), vec!["abc"]); + + assert_eq!(lines("abc\n"), vec!["abc\n"]); + assert_eq!(lines("abc\nxyz"), vec!["abc\n", "xyz"]); + assert_eq!(lines("abc\nxyz\n"), vec!["abc\n", "xyz\n"]); + + assert_eq!(lines("abc\n\n"), vec!["abc\n", "\n"]); + assert_eq!(lines("abc\n\n\n"), vec!["abc\n", "\n", "\n"]); + assert_eq!(lines("abc\n\nxyz"), vec!["abc\n", "\n", "xyz"]); + assert_eq!(lines("abc\n\nxyz\n"), vec!["abc\n", "\n", "xyz\n"]); + assert_eq!(lines("abc\nxyz\n\n"), vec!["abc\n", "xyz\n", "\n"]); + + assert_eq!(lines("\n"), vec!["\n"]); + assert_eq!(lines(""), Vec::<&str>::new()); + } + + 
#[test] + fn line_iter_empty() { + let mut it = LineStep::new(b'\n', 0, 0); + assert_eq!(it.next(b"abc"), None); + } + + #[test] + fn preceding_lines_doc() { + // These are the examples mentioned in the documentation of `preceding`. + let bytes = b"abc\nxyz\n"; + assert_eq!(4, preceding_by_pos(bytes, 7, b'\n', 0)); + assert_eq!(4, preceding_by_pos(bytes, 8, b'\n', 0)); + assert_eq!(0, preceding_by_pos(bytes, 7, b'\n', 1)); + assert_eq!(0, preceding_by_pos(bytes, 8, b'\n', 1)); + } + + #[test] + fn preceding_lines_sherlock() { + let t = SHERLOCK; + let lines = line_ranges(t); + + // The following tests check the count == 0 case, i.e., finding the + // beginning of the line containing the given position. + assert_eq!(0, prev(t, 0, 0)); + assert_eq!(0, prev(t, 1, 0)); + // The line terminator is addressed by `end-1` and terminates the line + // it is part of. + assert_eq!(0, prev(t, lines[0].end - 1, 0)); + assert_eq!(lines[0].start, prev(t, lines[0].end, 0)); + // The end position of line addresses the byte immediately following a + // line terminator, which puts it on the following line. + assert_eq!(lines[1].start, prev(t, lines[0].end + 1, 0)); + + // Now tests for count > 0. + assert_eq!(0, prev(t, 0, 1)); + assert_eq!(0, prev(t, 0, 2)); + assert_eq!(0, prev(t, 1, 1)); + assert_eq!(0, prev(t, 1, 2)); + assert_eq!(0, prev(t, lines[0].end - 1, 1)); + assert_eq!(0, prev(t, lines[0].end - 1, 2)); + assert_eq!(0, prev(t, lines[0].end, 1)); + assert_eq!(0, prev(t, lines[0].end, 2)); + assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1)); + assert_eq!(lines[3].start, prev(t, lines[4].end, 1)); + assert_eq!(lines[4].start, prev(t, lines[4].end + 1, 1)); + + // The last line has no line terminator.
+ assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); + assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0)); + assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); + assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); + } + + #[test] + fn preceding_lines_short() { + let t = "a\nb\nc\nd\ne\nf\n"; + let lines = line_ranges(t); + assert_eq!(12, t.len()); + + assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); + assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); + assert_eq!(lines[3].start, prev(t, lines[5].end, 2)); + assert_eq!(lines[2].start, prev(t, lines[5].end, 3)); + assert_eq!(lines[1].start, prev(t, lines[5].end, 4)); + assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); + assert_eq!(lines[0].start, prev(t, lines[5].end, 6)); + + assert_eq!(lines[5].start, prev(t, lines[5].end - 1, 0)); + assert_eq!(lines[4].start, prev(t, lines[5].end - 1, 1)); + assert_eq!(lines[3].start, prev(t, lines[5].end - 1, 2)); + assert_eq!(lines[2].start, prev(t, lines[5].end - 1, 3)); + assert_eq!(lines[1].start, prev(t, lines[5].end - 1, 4)); + assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 5)); + assert_eq!(lines[0].start, prev(t, lines[5].end - 1, 6)); + + assert_eq!(lines[4].start, prev(t, lines[5].start, 0)); + assert_eq!(lines[3].start, prev(t, lines[5].start, 1)); + assert_eq!(lines[2].start, prev(t, lines[5].start, 2)); + assert_eq!(lines[1].start, prev(t, lines[5].start, 3)); + assert_eq!(lines[0].start, prev(t, lines[5].start, 4)); + assert_eq!(lines[0].start, prev(t, lines[5].start, 5)); + + assert_eq!(lines[3].start, prev(t, lines[4].end - 1, 1)); + assert_eq!(lines[2].start, prev(t, lines[4].start, 1)); + + assert_eq!(lines[2].start, prev(t, lines[3].end - 1, 1)); + assert_eq!(lines[1].start, prev(t, lines[3].start, 1)); + + assert_eq!(lines[1].start, prev(t, lines[2].end - 1, 1)); + assert_eq!(lines[0].start, prev(t, lines[2].start, 1)); + + assert_eq!(lines[0].start, prev(t, lines[1].end - 1, 1)); + assert_eq!(lines[0].start, prev(t, 
lines[1].start, 1)); + + assert_eq!(lines[0].start, prev(t, lines[0].end - 1, 1)); + assert_eq!(lines[0].start, prev(t, lines[0].start, 1)); + } + + #[test] + fn preceding_lines_empty1() { + let t = "\n\n\nd\ne\nf\n"; + let lines = line_ranges(t); + assert_eq!(9, t.len()); + + assert_eq!(lines[0].start, prev(t, lines[0].end, 0)); + assert_eq!(lines[0].start, prev(t, lines[0].end, 1)); + assert_eq!(lines[1].start, prev(t, lines[1].end, 0)); + assert_eq!(lines[0].start, prev(t, lines[1].end, 1)); + + assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); + assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); + assert_eq!(lines[3].start, prev(t, lines[5].end, 2)); + assert_eq!(lines[2].start, prev(t, lines[5].end, 3)); + assert_eq!(lines[1].start, prev(t, lines[5].end, 4)); + assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); + assert_eq!(lines[0].start, prev(t, lines[5].end, 6)); + } + + #[test] + fn preceding_lines_empty2() { + let t = "a\n\n\nd\ne\nf\n"; + let lines = line_ranges(t); + assert_eq!(10, t.len()); + + assert_eq!(lines[0].start, prev(t, lines[0].end, 0)); + assert_eq!(lines[0].start, prev(t, lines[0].end, 1)); + assert_eq!(lines[1].start, prev(t, lines[1].end, 0)); + assert_eq!(lines[0].start, prev(t, lines[1].end, 1)); + + assert_eq!(lines[5].start, prev(t, lines[5].end, 0)); + assert_eq!(lines[4].start, prev(t, lines[5].end, 1)); + assert_eq!(lines[3].start, prev(t, lines[5].end, 2)); + assert_eq!(lines[2].start, prev(t, lines[5].end, 3)); + assert_eq!(lines[1].start, prev(t, lines[5].end, 4)); + assert_eq!(lines[0].start, prev(t, lines[5].end, 5)); + assert_eq!(lines[0].start, prev(t, lines[5].end, 6)); + } +} diff --git a/grep-searcher/src/macros.rs b/grep-searcher/src/macros.rs new file mode 100644 index 000000000..a4af7eaa0 --- /dev/null +++ b/grep-searcher/src/macros.rs @@ -0,0 +1,24 @@ +#[cfg(test)] +#[macro_export] +macro_rules! 
assert_eq_printed { + ($expected:expr, $got:expr, $($tt:tt)*) => { + let expected = &*$expected; + let got = &*$got; + let label = format!($($tt)*); + if expected != got { + panic!(" +printed outputs differ! (label: {}) + +expected: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +{} +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +got: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +{} +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +", label, expected, got); + } + } +} diff --git a/grep-searcher/src/searcher/core.rs b/grep-searcher/src/searcher/core.rs new file mode 100644 index 000000000..21dbae373 --- /dev/null +++ b/grep-searcher/src/searcher/core.rs @@ -0,0 +1,592 @@ +use std::cmp; + +use memchr::memchr; + +use grep_matcher::{LineMatchKind, Matcher}; +use lines::{self, LineStep}; +use line_buffer::BinaryDetection; +use searcher::{Config, Range, Searcher}; +use sink::{ + Sink, SinkError, + SinkFinish, SinkContext, SinkContextKind, SinkMatch, +}; + +#[derive(Debug)] +pub struct Core<'s, M: 's, S> { + config: &'s Config, + matcher: M, + searcher: &'s Searcher, + sink: S, + binary: bool, + pos: usize, + absolute_byte_offset: u64, + binary_byte_offset: Option, + line_number: Option, + last_line_counted: usize, + last_line_visited: usize, + after_context_left: usize, + has_sunk: bool, +} + +impl<'s, M: Matcher, S: Sink> Core<'s, M, S> { + pub fn new( + searcher: &'s Searcher, + matcher: M, + sink: S, + binary: bool, + ) -> Core<'s, M, S> { + let line_number = + if searcher.config.line_number { + Some(1) + } else { + None + }; + let core = Core { + config: &searcher.config, + matcher: matcher, + searcher: searcher, + sink: sink, + binary: binary, + pos: 0, + absolute_byte_offset: 0, + binary_byte_offset: None, + line_number: line_number, + last_line_counted: 0, + last_line_visited: 0, + after_context_left: 0, + has_sunk: false, + }; 
+ if !core.searcher.multi_line_with_matcher(&core.matcher) { + if core.is_line_by_line_fast() { + trace!("searcher core: will use fast line searcher"); + } else { + trace!("searcher core: will use slow line searcher"); + } + } + core + } + + pub fn pos(&self) -> usize { + self.pos + } + + pub fn set_pos(&mut self, pos: usize) { + self.pos = pos; + } + + pub fn binary_byte_offset(&self) -> Option { + self.binary_byte_offset.map(|offset| offset as u64) + } + + pub fn matcher(&self) -> &M { + &self.matcher + } + + pub fn matched( + &mut self, + buf: &[u8], + range: &Range, + ) -> Result { + self.sink_matched(buf, range) + } + + pub fn begin(&mut self) -> Result { + self.sink.begin(&self.searcher) + } + + pub fn finish( + &mut self, + byte_count: u64, + binary_byte_offset: Option, + ) -> Result<(), S::Error> { + self.sink.finish( + &self.searcher, + &SinkFinish { + byte_count, + binary_byte_offset, + }) + } + + pub fn match_by_line(&mut self, buf: &[u8]) -> Result { + if self.is_line_by_line_fast() { + self.match_by_line_fast(buf) + } else { + self.match_by_line_slow(buf) + } + } + + pub fn roll(&mut self, buf: &[u8]) -> usize { + let consumed = + if self.config.max_context() == 0 { + buf.len() + } else { + // It might seem like all we need to care about here is just + // the "before context," but in order to sink the context + // separator (when before_context==0 and after_context>0), we + // need to know something about the position of the previous + // line visited, even if we're at the beginning of the buffer. 
+ let context_start = lines::preceding( + buf, + self.config.line_term.as_byte(), + self.config.max_context(), + ); + let consumed = cmp::max(context_start, self.last_line_visited); + consumed + }; + self.count_lines(buf, consumed); + self.absolute_byte_offset += consumed as u64; + self.last_line_counted = 0; + self.last_line_visited = 0; + self.set_pos(buf.len() - consumed); + consumed + } + + pub fn detect_binary(&mut self, buf: &[u8], range: &Range) -> bool { + if self.binary_byte_offset.is_some() { + return true; + } + let binary_byte = match self.config.binary.0 { + BinaryDetection::Quit(b) => b, + _ => return false, + }; + if let Some(i) = memchr(binary_byte, &buf[*range]) { + self.binary_byte_offset = Some(range.start() + i); + true + } else { + false + } + } + + pub fn before_context_by_line( + &mut self, + buf: &[u8], + upto: usize, + ) -> Result { + if self.config.before_context == 0 { + return Ok(true); + } + let range = Range::new(self.last_line_visited, upto); + if range.is_empty() { + return Ok(true); + } + let before_context_start = range.start() + lines::preceding( + &buf[range], + self.config.line_term.as_byte(), + self.config.before_context - 1, + ); + + let range = Range::new(before_context_start, range.end()); + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + range.start(), + range.end(), + ); + while let Some(line) = stepper.next_match(buf) { + if !self.sink_break_context(line.start())? { + return Ok(false); + } + if !self.sink_before_context(buf, &line)? { + return Ok(false); + } + } + Ok(true) + } + + pub fn after_context_by_line( + &mut self, + buf: &[u8], + upto: usize, + ) -> Result { + if self.after_context_left == 0 { + return Ok(true); + } + let range = Range::new(self.last_line_visited, upto); + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + range.start(), + range.end(), + ); + while let Some(line) = stepper.next_match(buf) { + if !self.sink_after_context(buf, &line)? 
{ + return Ok(false); + } + if self.after_context_left == 0 { + break; + } + } + Ok(true) + } + + pub fn other_context_by_line( + &mut self, + buf: &[u8], + upto: usize, + ) -> Result { + let range = Range::new(self.last_line_visited, upto); + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + range.start(), + range.end(), + ); + while let Some(line) = stepper.next_match(buf) { + if !self.sink_other_context(buf, &line)? { + return Ok(false); + } + } + Ok(true) + } + + fn match_by_line_slow(&mut self, buf: &[u8]) -> Result { + debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); + + let range = Range::new(self.pos(), buf.len()); + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + range.start(), + range.end(), + ); + while let Some(line) = stepper.next_match(buf) { + let matched = { + // Stripping the line terminator is necessary to prevent some + // classes of regexes from matching the empty position *after* + // the end of the line. For example, `(?m)^$` will match at + // position (2, 2) in the string `a\n`. + let slice = lines::without_terminator( + &buf[line], + self.config.line_term, + ); + match self.matcher.shortest_match(slice) { + Err(err) => return Err(S::Error::error_message(err)), + Ok(result) => result.is_some(), + } + }; + self.set_pos(line.end()); + if matched != self.config.invert_match { + if !self.before_context_by_line(buf, line.start())? { + return Ok(false); + } + if !self.sink_matched(buf, &line)? { + return Ok(false); + } + } else if self.after_context_left >= 1 { + if !self.sink_after_context(buf, &line)? { + return Ok(false); + } + } else if self.config.passthru { + if !self.sink_other_context(buf, &line)? { + return Ok(false); + } + } + } + Ok(true) + } + + fn match_by_line_fast(&mut self, buf: &[u8]) -> Result { + debug_assert!(!self.config.passthru); + + while !buf[self.pos()..].is_empty() { + if self.config.invert_match { + if !self.match_by_line_fast_invert(buf)? 
{ + return Ok(false); + } + } else if let Some(line) = self.find_by_line_fast(buf)? { + if self.config.max_context() > 0 { + if !self.after_context_by_line(buf, line.start())? { + return Ok(false); + } + if !self.before_context_by_line(buf, line.start())? { + return Ok(false); + } + } + self.set_pos(line.end()); + if !self.sink_matched(buf, &line)? { + return Ok(false); + } + } else { + break; + } + } + if !self.after_context_by_line(buf, buf.len())? { + return Ok(false); + } + self.set_pos(buf.len()); + Ok(true) + } + + #[inline(always)] + fn match_by_line_fast_invert( + &mut self, + buf: &[u8], + ) -> Result { + assert!(self.config.invert_match); + + let invert_match = match self.find_by_line_fast(buf)? { + None => { + let range = Range::new(self.pos(), buf.len()); + self.set_pos(range.end()); + range + } + Some(line) => { + let range = Range::new(self.pos(), line.start()); + self.set_pos(line.end()); + range + } + }; + if invert_match.is_empty() { + return Ok(true); + } + if !self.after_context_by_line(buf, invert_match.start())? { + return Ok(false); + } + if !self.before_context_by_line(buf, invert_match.start())? { + return Ok(false); + } + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + invert_match.start(), + invert_match.end(), + ); + while let Some(line) = stepper.next_match(buf) { + if !self.sink_matched(buf, &line)? 
{ + return Ok(false); + } + } + Ok(true) + } + + #[inline(always)] + fn find_by_line_fast( + &self, + buf: &[u8], + ) -> Result, S::Error> { + debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); + debug_assert!(self.is_line_by_line_fast()); + + let mut pos = self.pos(); + while !buf[pos..].is_empty() { + match self.matcher.find_candidate_line(&buf[pos..]) { + Err(err) => return Err(S::Error::error_message(err)), + Ok(None) => return Ok(None), + Ok(Some(LineMatchKind::Confirmed(i))) => { + let line = lines::locate( + buf, + self.config.line_term.as_byte(), + Range::zero(i).offset(pos), + ); + // If we matched beyond the end of the buffer, then we + // don't report this as a match. + if line.start() == buf.len() { + pos = buf.len(); + continue; + } + return Ok(Some(line)); + } + Ok(Some(LineMatchKind::Candidate(i))) => { + let line = lines::locate( + buf, + self.config.line_term.as_byte(), + Range::zero(i).offset(pos), + ); + // We need to strip the line terminator here to match the + // semantics of line-by-line searching. Namely, regexes + // like `(?m)^$` can match at the final position beyond a + // line terminator, which is non-sensical in line oriented + // matching. + let slice = lines::without_terminator( + &buf[line], + self.config.line_term, + ); + match self.matcher.is_match(slice) { + Err(err) => return Err(S::Error::error_message(err)), + Ok(true) => return Ok(Some(line)), + Ok(false) => { + pos = line.end(); + continue; + } + } + } + } + } + Ok(None) + } + + #[inline(always)] + fn sink_matched( + &mut self, + buf: &[u8], + range: &Range, + ) -> Result { + if self.binary && self.detect_binary(buf, range) { + return Ok(false); + } + if !self.sink_break_context(range.start())? 
{ + return Ok(false); + } + self.count_lines(buf, range.start()); + let offset = self.absolute_byte_offset + range.start() as u64; + let linebuf = + if self.config.line_term.is_crlf() { + // Normally, a line terminator is never part of a match, but + // if the line terminator is CRLF, then it's possible for `\r` + // to end up in the match, which we generally don't want. So + // we strip it here. + lines::without_terminator(&buf[*range], self.config.line_term) + } else { + &buf[*range] + }; + let keepgoing = self.sink.matched( + &self.searcher, + &SinkMatch { + line_term: self.config.line_term, + bytes: linebuf, + absolute_byte_offset: offset, + line_number: self.line_number, + }, + )?; + if !keepgoing { + return Ok(false); + } + self.last_line_visited = range.end(); + self.after_context_left = self.config.after_context; + self.has_sunk = true; + Ok(true) + } + + fn sink_before_context( + &mut self, + buf: &[u8], + range: &Range, + ) -> Result { + if self.binary && self.detect_binary(buf, range) { + return Ok(false); + } + self.count_lines(buf, range.start()); + let offset = self.absolute_byte_offset + range.start() as u64; + let keepgoing = self.sink.context( + &self.searcher, + &SinkContext { + line_term: self.config.line_term, + bytes: &buf[*range], + kind: SinkContextKind::Before, + absolute_byte_offset: offset, + line_number: self.line_number, + }, + )?; + if !keepgoing { + return Ok(false); + } + self.last_line_visited = range.end(); + self.has_sunk = true; + Ok(true) + } + + fn sink_after_context( + &mut self, + buf: &[u8], + range: &Range, + ) -> Result { + assert!(self.after_context_left >= 1); + + if self.binary && self.detect_binary(buf, range) { + return Ok(false); + } + self.count_lines(buf, range.start()); + let offset = self.absolute_byte_offset + range.start() as u64; + let keepgoing = self.sink.context( + &self.searcher, + &SinkContext { + line_term: self.config.line_term, + bytes: &buf[*range], + kind: SinkContextKind::After, + 
absolute_byte_offset: offset, + line_number: self.line_number, + }, + )?; + if !keepgoing { + return Ok(false); + } + self.last_line_visited = range.end(); + self.after_context_left -= 1; + self.has_sunk = true; + Ok(true) + } + + fn sink_other_context( + &mut self, + buf: &[u8], + range: &Range, + ) -> Result { + if self.binary && self.detect_binary(buf, range) { + return Ok(false); + } + self.count_lines(buf, range.start()); + let offset = self.absolute_byte_offset + range.start() as u64; + let keepgoing = self.sink.context( + &self.searcher, + &SinkContext { + line_term: self.config.line_term, + bytes: &buf[*range], + kind: SinkContextKind::Other, + absolute_byte_offset: offset, + line_number: self.line_number, + }, + )?; + if !keepgoing { + return Ok(false); + } + self.last_line_visited = range.end(); + self.has_sunk = true; + Ok(true) + } + + fn sink_break_context( + &mut self, + start_of_line: usize, + ) -> Result { + let is_gap = self.last_line_visited < start_of_line; + let any_context = + self.config.before_context > 0 + || self.config.after_context > 0; + + if !any_context || !self.has_sunk || !is_gap { + Ok(true) + } else { + self.sink.context_break(&self.searcher) + } + } + + fn count_lines(&mut self, buf: &[u8], upto: usize) { + if let Some(ref mut line_number) = self.line_number { + if self.last_line_counted >= upto { + return; + } + let slice = &buf[self.last_line_counted..upto]; + let count = lines::count(slice, self.config.line_term.as_byte()); + *line_number += count; + self.last_line_counted = upto; + } + } + + fn is_line_by_line_fast(&self) -> bool { + debug_assert!(!self.searcher.multi_line_with_matcher(&self.matcher)); + + if self.config.passthru { + return false; + } + if let Some(line_term) = self.matcher.line_terminator() { + if line_term == self.config.line_term { + return true; + } + } + if let Some(non_matching) = self.matcher.non_matching_bytes() { + // If the line terminator is CRLF, we don't actually need to care + // whether the 
regex can match `\r` or not. Namely, a `\r` is + // neither necessary nor sufficient to terminate a line. A `\n` is + // always required. + if non_matching.contains(self.config.line_term.as_byte()) { + return true; + } + } + false + } +} diff --git a/grep-searcher/src/searcher/glue.rs b/grep-searcher/src/searcher/glue.rs new file mode 100644 index 000000000..3a5d42911 --- /dev/null +++ b/grep-searcher/src/searcher/glue.rs @@ -0,0 +1,1506 @@ +use std::cmp; +use std::io; + +use grep_matcher::Matcher; +use lines::{self, LineStep}; +use line_buffer::{DEFAULT_BUFFER_CAPACITY, LineBufferReader}; +use sink::{Sink, SinkError}; + +use searcher::{Config, Range, Searcher}; +use searcher::core::Core; + +#[derive(Debug)] +pub struct ReadByLine<'s, M: 's, R, S> { + config: &'s Config, + core: Core<'s, M, S>, + rdr: LineBufferReader<'s, R>, +} + +impl<'s, M, R, S> ReadByLine<'s, M, R, S> +where M: Matcher, + R: io::Read, + S: Sink +{ + pub fn new( + searcher: &'s Searcher, + matcher: M, + read_from: LineBufferReader<'s, R>, + write_to: S, + ) -> ReadByLine<'s, M, R, S> { + debug_assert!(!searcher.multi_line_with_matcher(&matcher)); + + ReadByLine { + config: &searcher.config, + core: Core::new(searcher, matcher, write_to, false), + rdr: read_from, + } + } + + pub fn run(mut self) -> Result<(), S::Error> { + if self.core.begin()? { + while + self.fill()? && self.core.match_by_line(self.rdr.buffer())? 
+ {} + } + self.core.finish( + self.rdr.absolute_byte_offset(), + self.rdr.binary_byte_offset(), + ) + } + + fn fill(&mut self) -> Result { + assert!(self.rdr.buffer()[self.core.pos()..].is_empty()); + + let old_buf_len = self.rdr.buffer().len(); + let consumed = self.core.roll(self.rdr.buffer()); + self.rdr.consume(consumed); + let didread = match self.rdr.fill() { + Err(err) => return Err(S::Error::error_io(err)), + Ok(didread) => didread, + }; + if !didread || self.rdr.binary_byte_offset().is_some() { + return Ok(false); + } + // If rolling the buffer didn't result in consuming anything and if + // re-filling the buffer didn't add any bytes, then the only thing in + // our buffer is leftover context, which we no longer need since there + // is nothing left to search. So forcefully quit. + if consumed == 0 && old_buf_len == self.rdr.buffer().len() { + self.rdr.consume(old_buf_len); + return Ok(false); + } + Ok(true) + } +} + +#[derive(Debug)] +pub struct SliceByLine<'s, M: 's, S> { + config: &'s Config, + core: Core<'s, M, S>, + slice: &'s [u8], +} + +impl<'s, M: Matcher, S: Sink> SliceByLine<'s, M, S> { + pub fn new( + searcher: &'s Searcher, + matcher: M, + slice: &'s [u8], + write_to: S, + ) -> SliceByLine<'s, M, S> { + debug_assert!(!searcher.multi_line_with_matcher(&matcher)); + + SliceByLine { + config: &searcher.config, + core: Core::new(searcher, matcher, write_to, true), + slice: slice, + } + } + + pub fn run(mut self) -> Result<(), S::Error> { + if self.core.begin()? { + let binary_upto = cmp::min( + self.slice.len(), + DEFAULT_BUFFER_CAPACITY, + ); + let binary_range = Range::new(0, binary_upto); + if !self.core.detect_binary(self.slice, &binary_range) { + while + !self.slice[self.core.pos()..].is_empty() + && self.core.match_by_line(self.slice)? 
+ {} + } + } + let byte_count = self.byte_count(); + let binary_byte_offset = self.core.binary_byte_offset(); + self.core.finish(byte_count, binary_byte_offset) + } + + fn byte_count(&mut self) -> u64 { + match self.core.binary_byte_offset() { + Some(offset) if offset < self.core.pos() as u64 => offset, + _ => self.core.pos() as u64, + } + } +} + +#[derive(Debug)] +pub struct MultiLine<'s, M: 's, S> { + config: &'s Config, + core: Core<'s, M, S>, + slice: &'s [u8], + last_match: Option, +} + +impl<'s, M: Matcher, S: Sink> MultiLine<'s, M, S> { + pub fn new( + searcher: &'s Searcher, + matcher: M, + slice: &'s [u8], + write_to: S, + ) -> MultiLine<'s, M, S> { + debug_assert!(searcher.multi_line_with_matcher(&matcher)); + + MultiLine { + config: &searcher.config, + core: Core::new(searcher, matcher, write_to, true), + slice: slice, + last_match: None, + } + } + + pub fn run(mut self) -> Result<(), S::Error> { + if self.core.begin()? { + let binary_upto = cmp::min( + self.slice.len(), + DEFAULT_BUFFER_CAPACITY, + ); + let binary_range = Range::new(0, binary_upto); + if !self.core.detect_binary(self.slice, &binary_range) { + let mut keepgoing = true; + while !self.slice[self.core.pos()..].is_empty() && keepgoing { + keepgoing = self.sink()?; + } + if keepgoing { + keepgoing = match self.last_match.take() { + None => true, + Some(last_match) => { + if self.sink_context(&last_match)? { + self.sink_matched(&last_match)?; + } + true + } + }; + } + // Take care of any remaining context after the last match. 
+ if keepgoing { + if self.config.passthru { + self.core.other_context_by_line( + self.slice, + self.slice.len(), + )?; + } else { + self.core.after_context_by_line( + self.slice, + self.slice.len(), + )?; + } + } + } + } + let byte_count = self.byte_count(); + let binary_byte_offset = self.core.binary_byte_offset(); + self.core.finish(byte_count, binary_byte_offset) + } + + fn sink(&mut self) -> Result { + if self.config.invert_match { + return self.sink_matched_inverted(); + } + let mat = match self.find()? { + Some(range) => range, + None => { + self.core.set_pos(self.slice.len()); + return Ok(true); + } + }; + self.advance(&mat); + + let line = lines::locate( + self.slice, + self.config.line_term.as_byte(), + mat, + ); + // We delay sinking the match to make sure we group adjacent matches + // together in a single sink. Adjacent matches are distinct matches + // that start and end on the same line, respectively. This guarantees + // that a single line is never sinked more than once. + match self.last_match.take() { + None => { + self.last_match = Some(line); + Ok(true) + } + Some(last_match) => { + // If the lines in the previous match overlap with the lines + // in this match, then simply grow the match and move on. + // This happens when the next match begins on the same line + // that the last match ends on. + if last_match.end() > line.start() { + self.last_match = Some(last_match.with_end(line.end())); + Ok(true) + } else { + self.last_match = Some(line); + if !self.sink_context(&last_match)? { + return Ok(false); + } + self.sink_matched(&last_match) + } + } + } + } + + fn sink_matched_inverted(&mut self) -> Result { + assert!(self.config.invert_match); + + let invert_match = match self.find()? 
{ + None => { + let range = Range::new(self.core.pos(), self.slice.len()); + self.core.set_pos(range.end()); + range + } + Some(mat) => { + let line = lines::locate( + self.slice, + self.config.line_term.as_byte(), + mat, + ); + let range = Range::new(self.core.pos(), line.start()); + self.advance(&line); + range + } + }; + if invert_match.is_empty() { + return Ok(true); + } + if !self.sink_context(&invert_match)? { + return Ok(false); + } + let mut stepper = LineStep::new( + self.config.line_term.as_byte(), + invert_match.start(), + invert_match.end(), + ); + while let Some(line) = stepper.next_match(self.slice) { + if !self.sink_matched(&line)? { + return Ok(false); + } + } + Ok(true) + } + + fn sink_matched(&mut self, range: &Range) -> Result { + if range.is_empty() { + // The only way we can produce an empty line for a match is if we + // match the position immediately following the last byte that we + // search, and where that last byte is also the line terminator. We + // never want to report that match, and we know we're done at that + // point anyway, so stop the search. + return Ok(false); + } + self.core.matched(self.slice, range) + } + + fn sink_context(&mut self, range: &Range) -> Result { + if self.config.passthru { + if !self.core.other_context_by_line(self.slice, range.start())? { + return Ok(false); + } + } else { + if !self.core.after_context_by_line(self.slice, range.start())? { + return Ok(false); + } + if !self.core.before_context_by_line(self.slice, range.start())? { + return Ok(false); + } + } + Ok(true) + } + + fn find(&mut self) -> Result, S::Error> { + match self.core.matcher().find(&self.slice[self.core.pos()..]) { + Err(err) => Err(S::Error::error_message(err)), + Ok(None) => Ok(None), + Ok(Some(m)) => Ok(Some(m.offset(self.core.pos()))), + } + } + + /// Advance the search position based on the previous match. + /// + /// If the previous match is zero width, then this advances the search + /// position one byte past the end of the match. 
+ fn advance(&mut self, range: &Range) { + self.core.set_pos(range.end()); + if range.is_empty() && self.core.pos() < self.slice.len() { + let newpos = self.core.pos() + 1; + self.core.set_pos(newpos); + } + } + + fn byte_count(&mut self) -> u64 { + match self.core.binary_byte_offset() { + Some(offset) if offset < self.core.pos() as u64 => offset, + _ => self.core.pos() as u64, + } + } +} + +#[cfg(test)] +mod tests { + use searcher::{BinaryDetection, SearcherBuilder}; + use testutil::{KitchenSink, RegexMatcher, SearcherTester}; + + use super::*; + + const SHERLOCK: &'static str = "\ +For the Doctor Watsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached.\ +"; + + const CODE: &'static str = "\ +extern crate snap; + +use std::io; + +fn main() { + let stdin = io::stdin(); + let stdout = io::stdout(); + + // Wrap the stdin reader in a Snappy reader. + let mut rdr = snap::Reader::new(stdin.lock()); + let mut wtr = stdout.lock(); + io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); +} +"; + + #[test] + fn basic1() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +129:be, to a very large extent, the result of luck. 
Sherlock Holmes + +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn basic2() { + let exp = "\nbyte count:366\n"; + SearcherTester::new(SHERLOCK, "NADA") + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn basic3() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, "a") + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn basic4() { + let haystack = "\ +a +b + +c + + +d +"; + let byte_count = haystack.len(); + let exp = format!("0:a\n\nbyte count:{}\n", byte_count); + SearcherTester::new(haystack, "a") + .line_number(false) + .expected_no_line_number(&exp) + .test(); + } + + #[test] + fn invert1() { + let exp = "\ +65:Holmeses, success in the province of detective work must always +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .line_number(false) + .invert_match(true) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn line_number1() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +129:be, to a very large extent, the result of luck. Sherlock Holmes + +byte count:366 +"; + let exp_line = "\ +1:0:For the Doctor Watsons of this world, as opposed to the Sherlock +3:129:be, to a very large extent, the result of luck. 
Sherlock Holmes + +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .expected_no_line_number(exp) + .expected_with_line_number(exp_line) + .test(); + } + + #[test] + fn line_number_invert1() { + let exp = "\ +65:Holmeses, success in the province of detective work must always +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + let exp_line = "\ +2:65:Holmeses, success in the province of detective work must always +4:193:can extract a clew from a wisp of straw or a flake of cigar ash; +5:258:but Doctor Watson has to have it taken out for him and dusted, +6:321:and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .invert_match(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_line) + .test(); + } + + #[test] + fn multi_line_overlap1() { + let haystack = "xxx\nabc\ndefxxxabc\ndefxxx\nxxx"; + let byte_count = haystack.len(); + let exp = format!( + "4:abc\n8:defxxxabc\n18:defxxx\n\nbyte count:{}\n", + byte_count); + + SearcherTester::new(haystack, "abc\ndef") + .by_line(false) + .line_number(false) + .expected_no_line_number(&exp) + .test(); + } + + #[test] + fn multi_line_overlap2() { + let haystack = "xxx\nabc\ndefabc\ndefxxx\nxxx"; + let byte_count = haystack.len(); + let exp = format!( + "4:abc\n8:defabc\n15:defxxx\n\nbyte count:{}\n", + byte_count); + + SearcherTester::new(haystack, "abc\ndef") + .by_line(false) + .line_number(false) + .expected_no_line_number(&exp) + .test(); + } + + #[test] + fn empty_line1() { + let exp = "\nbyte count:0\n"; + SearcherTester::new("", r"^$") + .expected_no_line_number(exp) + .expected_with_line_number(exp) + .test(); + } + + #[test] + fn empty_line2() { + let exp = "0:\n\nbyte count:1\n"; + let exp_line = "1:0:\n\nbyte count:1\n"; + + SearcherTester::new("\n", r"^$") + 
.expected_no_line_number(exp) + .expected_with_line_number(exp_line) + .test(); + } + + #[test] + fn empty_line3() { + let exp = "0:\n1:\n\nbyte count:2\n"; + let exp_line = "1:0:\n2:1:\n\nbyte count:2\n"; + + SearcherTester::new("\n\n", r"^$") + .expected_no_line_number(exp) + .expected_with_line_number(exp_line) + .test(); + } + + #[test] + fn empty_line4() { + // See: https://github.com/BurntSushi/ripgrep/issues/441 + let haystack = "\ +a +b + +c + + +d +"; + let byte_count = haystack.len(); + let exp = format!("4:\n7:\n8:\n\nbyte count:{}\n", byte_count); + let exp_line = format!( + "3:4:\n5:7:\n6:8:\n\nbyte count:{}\n", + byte_count); + + SearcherTester::new(haystack, r"^$") + .expected_no_line_number(&exp) + .expected_with_line_number(&exp_line) + .test(); + } + + #[test] + fn empty_line5() { + // See: https://github.com/BurntSushi/ripgrep/issues/441 + // This is like empty_line4, but lacks the trailing line terminator. + let haystack = "\ +a +b + +c + + +d"; + let byte_count = haystack.len(); + let exp = format!("4:\n7:\n8:\n\nbyte count:{}\n", byte_count); + let exp_line = format!( + "3:4:\n5:7:\n6:8:\n\nbyte count:{}\n", + byte_count); + + SearcherTester::new(haystack, r"^$") + .expected_no_line_number(&exp) + .expected_with_line_number(&exp_line) + .test(); + } + + #[test] + fn empty_line6() { + // See: https://github.com/BurntSushi/ripgrep/issues/441 + // This is like empty_line4, but includes an empty line at the end. + let haystack = "\ +a +b + +c + + +d + +"; + let byte_count = haystack.len(); + let exp = format!( + "4:\n7:\n8:\n11:\n\nbyte count:{}\n", + byte_count); + let exp_line = format!( + "3:4:\n5:7:\n6:8:\n8:11:\n\nbyte count:{}\n", + byte_count); + + SearcherTester::new(haystack, r"^$") + .expected_no_line_number(&exp) + .expected_with_line_number(&exp_line) + .test(); + } + + #[test] + fn big1() { + let mut haystack = String::new(); + haystack.push_str("a\n"); + // Pick an arbitrary number above the capacity. 
+ for _ in 0..(4 * (DEFAULT_BUFFER_CAPACITY + 7)) { + haystack.push_str("zzz\n"); + } + haystack.push_str("a\n"); + + let byte_count = haystack.len(); + let exp = format!("0:a\n131186:a\n\nbyte count:{}\n", byte_count); + + SearcherTester::new(&haystack, "a") + .line_number(false) + .expected_no_line_number(&exp) + .test(); + } + + #[test] + fn big_error_one_line() { + let mut haystack = String::new(); + haystack.push_str("a\n"); + // Pick an arbitrary number above the capacity. + for _ in 0..(4 * (DEFAULT_BUFFER_CAPACITY + 7)) { + haystack.push_str("zzz\n"); + } + haystack.push_str("a\n"); + + let matcher = RegexMatcher::new("a"); + let mut sink = KitchenSink::new(); + let mut searcher = SearcherBuilder::new() + .heap_limit(Some(3)) // max line length is 4, one byte short + .build(); + let result = searcher.search_reader( + &matcher, + haystack.as_bytes(), + &mut sink, + ); + assert!(result.is_err()); + } + + #[test] + fn big_error_multi_line() { + let mut haystack = String::new(); + haystack.push_str("a\n"); + // Pick an arbitrary number above the capacity. 
+ for _ in 0..(4 * (DEFAULT_BUFFER_CAPACITY + 7)) { + haystack.push_str("zzz\n"); + } + haystack.push_str("a\n"); + + let matcher = RegexMatcher::new("a"); + let mut sink = KitchenSink::new(); + let mut searcher = SearcherBuilder::new() + .multi_line(true) + .heap_limit(Some(haystack.len())) // actually need one more byte + .build(); + let result = searcher.search_reader( + &matcher, + haystack.as_bytes(), + &mut sink, + ); + assert!(result.is_err()); + } + + #[test] + fn binary1() { + let haystack = "\x00a"; + let exp = "\nbyte count:0\nbinary offset:0\n"; + + SearcherTester::new(haystack, "a") + .binary_detection(BinaryDetection::quit(0)) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn binary2() { + let haystack = "a\x00"; + let exp = "\nbyte count:0\nbinary offset:1\n"; + + SearcherTester::new(haystack, "a") + .binary_detection(BinaryDetection::quit(0)) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn binary3() { + let mut haystack = String::new(); + haystack.push_str("a\n"); + for _ in 0..DEFAULT_BUFFER_CAPACITY { + haystack.push_str("zzz\n"); + } + haystack.push_str("a\n"); + haystack.push_str("a\x00a\n"); + haystack.push_str("a\n"); + + // The line buffered searcher has slightly different semantics here. + // Namely, it will *always* detect binary data in the current buffer + // before searching it. Thus, the total number of bytes searched is + // smaller than below. + let exp = "0:a\n\nbyte count:32770\nbinary offset:32773\n"; + // In contrast, the slice readers (for multi line as well) will only + // look for binary data in the initial chunk of bytes. After that + // point, it only looks for binary data in matches. Note though that + // the binary offset remains the same. (See the binary4 test for a case + // where the offset is explicitly different.) 
+ let exp_slice = + "0:a\n32770:a\n\nbyte count:32773\nbinary offset:32773\n"; + + SearcherTester::new(&haystack, "a") + .binary_detection(BinaryDetection::quit(0)) + .line_number(false) + .auto_heap_limit(false) + .expected_no_line_number(exp) + .expected_slice_no_line_number(exp_slice) + .test(); + } + + #[test] + fn binary4() { + let mut haystack = String::new(); + haystack.push_str("a\n"); + for _ in 0..DEFAULT_BUFFER_CAPACITY { + haystack.push_str("zzz\n"); + } + haystack.push_str("a\n"); + // The Read searcher will detect binary data here, but since this is + // beyond the initial buffer size and doesn't otherwise contain a + // match, the Slice reader won't detect the binary data until the next + // line (which is a match). + haystack.push_str("b\x00b\n"); + haystack.push_str("a\x00a\n"); + haystack.push_str("a\n"); + + let exp = "0:a\n\nbyte count:32770\nbinary offset:32773\n"; + // The binary offset for the Slice readers corresponds to the binary + // data in `a\x00a\n` since the first line with binary data + // (`b\x00b\n`) isn't part of a match, and is therefore undetected. + let exp_slice = + "0:a\n32770:a\n\nbyte count:32777\nbinary offset:32777\n"; + + SearcherTester::new(&haystack, "a") + .binary_detection(BinaryDetection::quit(0)) + .line_number(false) + .auto_heap_limit(false) + .expected_no_line_number(exp) + .expected_slice_no_line_number(exp_slice) + .test(); + } + + #[test] + fn passthru_sherlock1() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, +321-and exhibited clearly, with a label attached. 
+byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .passthru(true) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn passthru_sherlock_invert1() { + let exp = "\ +0-For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +129-be, to a very large extent, the result of luck. Sherlock Holmes +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .passthru(true) + .line_number(false) + .invert_match(true) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock1() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; + +byte count:366 +"; + let exp_lines = "\ +1:0:For the Doctor Watsons of this world, as opposed to the Sherlock +2-65-Holmeses, success in the province of detective work must always +3:129:be, to a very large extent, the result of luck. 
Sherlock Holmes +4-193-can extract a clew from a wisp of straw or a flake of cigar ash; + +byte count:366 +"; + // before and after + line numbers + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(1) + .before_context(1) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(1) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes + +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .before_context(1) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock_invert1() { + let exp = "\ +0-For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +129-be, to a very large extent, the result of luck. Sherlock Holmes +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + let exp_lines = "\ +1-0-For the Doctor Watsons of this world, as opposed to the Sherlock +2:65:Holmeses, success in the province of detective work must always +3-129-be, to a very large extent, the result of luck. Sherlock Holmes +4:193:can extract a clew from a wisp of straw or a flake of cigar ash; +5:258:but Doctor Watson has to have it taken out for him and dusted, +6:321:and exhibited clearly, with a label attached. 
+byte count:366 +"; + // before and after + line numbers + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(1) + .before_context(1) + .line_number(true) + .invert_match(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // before + SearcherTester::new(SHERLOCK, "Sherlock") + .before_context(1) + .line_number(false) + .invert_match(true) + .expected_no_line_number(exp) + .test(); + + // after + let exp = "\ +65:Holmeses, success in the province of detective work must always +129-be, to a very large extent, the result of luck. Sherlock Holmes +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(1) + .line_number(false) + .invert_match(true) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock2() { + let exp = "\ +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + let exp_lines = "\ +2-65-Holmeses, success in the province of detective work must always +3:129:be, to a very large extent, the result of luck. Sherlock Holmes +4:193:can extract a clew from a wisp of straw or a flake of cigar ash; +5-258-but Doctor Watson has to have it taken out for him and dusted, +6:321:and exhibited clearly, with a label attached. 
+byte count:366 +"; + // before + after + line numbers + SearcherTester::new(SHERLOCK, " a ") + .after_context(1) + .before_context(1) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // before + SearcherTester::new(SHERLOCK, " a ") + .before_context(1) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // after + let exp = "\ +129:be, to a very large extent, the result of luck. Sherlock Holmes +193:can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, " a ") + .after_context(1) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock_invert2() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +129-be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321-and exhibited clearly, with a label attached. +byte count:366 +"; + let exp_lines = "\ +1:0:For the Doctor Watsons of this world, as opposed to the Sherlock +2:65:Holmeses, success in the province of detective work must always +3-129-be, to a very large extent, the result of luck. Sherlock Holmes +4-193-can extract a clew from a wisp of straw or a flake of cigar ash; +5:258:but Doctor Watson has to have it taken out for him and dusted, +6-321-and exhibited clearly, with a label attached. 
+byte count:366 +"; + // before + after + line numbers + SearcherTester::new(SHERLOCK, " a ") + .after_context(1) + .before_context(1) + .line_number(true) + .invert_match(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // before + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +-- +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, + +byte count:366 +"; + SearcherTester::new(SHERLOCK, " a ") + .before_context(1) + .line_number(false) + .invert_match(true) + .expected_no_line_number(exp) + .test(); + + // after + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +129-be, to a very large extent, the result of luck. Sherlock Holmes +-- +258:but Doctor Watson has to have it taken out for him and dusted, +321-and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, " a ") + .after_context(1) + .line_number(false) + .invert_match(true) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock3() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, + +byte count:366 +"; + let exp_lines = "\ +1:0:For the Doctor Watsons of this world, as opposed to the Sherlock +2-65-Holmeses, success in the province of detective work must always +3:129:be, to a very large extent, the result of luck. 
Sherlock Holmes +4-193-can extract a clew from a wisp of straw or a flake of cigar ash; +5-258-but Doctor Watson has to have it taken out for him and dusted, + +byte count:366 +"; + // before and after + line numbers + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(2) + .before_context(2) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes + +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .before_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock4() { + let exp = "\ +129-be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, +321-and exhibited clearly, with a label attached. +byte count:366 +"; + let exp_lines = "\ +3-129-be, to a very large extent, the result of luck. Sherlock Holmes +4-193-can extract a clew from a wisp of straw or a flake of cigar ash; +5:258:but Doctor Watson has to have it taken out for him and dusted, +6-321-and exhibited clearly, with a label attached. +byte count:366 +"; + // before and after + line numbers + SearcherTester::new(SHERLOCK, "dusted") + .after_context(2) + .before_context(2) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + let exp = "\ +258:but Doctor Watson has to have it taken out for him and dusted, +321-and exhibited clearly, with a label attached. 
+byte count:366 +"; + SearcherTester::new(SHERLOCK, "dusted") + .after_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +129-be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258:but Doctor Watson has to have it taken out for him and dusted, + +byte count:366 +"; + SearcherTester::new(SHERLOCK, "dusted") + .before_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock5() { + let exp = "\ +0-For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +129-be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + let exp_lines = "\ +1-0-For the Doctor Watsons of this world, as opposed to the Sherlock +2:65:Holmeses, success in the province of detective work must always +3-129-be, to a very large extent, the result of luck. Sherlock Holmes +4-193-can extract a clew from a wisp of straw or a flake of cigar ash; +5-258-but Doctor Watson has to have it taken out for him and dusted, +6:321:and exhibited clearly, with a label attached. +byte count:366 +"; + // before and after + line numbers + SearcherTester::new(SHERLOCK, "success|attached") + .after_context(2) + .before_context(2) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + let exp = "\ +65:Holmeses, success in the province of detective work must always +129-be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +-- +321:and exhibited clearly, with a label attached. 
+byte count:366 +"; + SearcherTester::new(SHERLOCK, "success|attached") + .after_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +0-For the Doctor Watsons of this world, as opposed to the Sherlock +65:Holmeses, success in the province of detective work must always +-- +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, +321:and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, "success|attached") + .before_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_sherlock6() { + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, +321-and exhibited clearly, with a label attached. +byte count:366 +"; + let exp_lines = "\ +1:0:For the Doctor Watsons of this world, as opposed to the Sherlock +2-65-Holmeses, success in the province of detective work must always +3:129:be, to a very large extent, the result of luck. Sherlock Holmes +4-193-can extract a clew from a wisp of straw or a flake of cigar ash; +5-258-but Doctor Watson has to have it taken out for him and dusted, +6-321-and exhibited clearly, with a label attached. 
+byte count:366 +"; + // before and after + line numbers + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(3) + .before_context(3) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes +193-can extract a clew from a wisp of straw or a flake of cigar ash; +258-but Doctor Watson has to have it taken out for him and dusted, +321-and exhibited clearly, with a label attached. +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .after_context(3) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +0:For the Doctor Watsons of this world, as opposed to the Sherlock +65-Holmeses, success in the province of detective work must always +129:be, to a very large extent, the result of luck. Sherlock Holmes + +byte count:366 +"; + SearcherTester::new(SHERLOCK, "Sherlock") + .before_context(3) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_code1() { + // before and after + let exp = "\ +33- +34-fn main() { +46: let stdin = io::stdin(); +75- let stdout = io::stdout(); +106- +107: // Wrap the stdin reader in a Snappy reader. +156: let mut rdr = snap::Reader::new(stdin.lock()); +207- let mut wtr = stdout.lock(); +240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); + +byte count:307 +"; + let exp_lines = "\ +4-33- +5-34-fn main() { +6:46: let stdin = io::stdin(); +7-75- let stdout = io::stdout(); +8-106- +9:107: // Wrap the stdin reader in a Snappy reader. 
+10:156: let mut rdr = snap::Reader::new(stdin.lock()); +11-207- let mut wtr = stdout.lock(); +12-240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); + +byte count:307 +"; + // before and after + line numbers + SearcherTester::new(CODE, "stdin") + .after_context(2) + .before_context(2) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + let exp = "\ +46: let stdin = io::stdin(); +75- let stdout = io::stdout(); +106- +107: // Wrap the stdin reader in a Snappy reader. +156: let mut rdr = snap::Reader::new(stdin.lock()); +207- let mut wtr = stdout.lock(); +240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); + +byte count:307 +"; + SearcherTester::new(CODE, "stdin") + .after_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +33- +34-fn main() { +46: let stdin = io::stdin(); +75- let stdout = io::stdout(); +106- +107: // Wrap the stdin reader in a Snappy reader. +156: let mut rdr = snap::Reader::new(stdin.lock()); + +byte count:307 +"; + SearcherTester::new(CODE, "stdin") + .before_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_code2() { + let exp = "\ +34-fn main() { +46- let stdin = io::stdin(); +75: let stdout = io::stdout(); +106- +107- // Wrap the stdin reader in a Snappy reader. +156- let mut rdr = snap::Reader::new(stdin.lock()); +207: let mut wtr = stdout.lock(); +240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); +305-} + +byte count:307 +"; + let exp_lines = "\ +5-34-fn main() { +6-46- let stdin = io::stdin(); +7:75: let stdout = io::stdout(); +8-106- +9-107- // Wrap the stdin reader in a Snappy reader. 
+10-156- let mut rdr = snap::Reader::new(stdin.lock()); +11:207: let mut wtr = stdout.lock(); +12-240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); +13-305-} + +byte count:307 +"; + // before and after + line numbers + SearcherTester::new(CODE, "stdout") + .after_context(2) + .before_context(2) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + let exp = "\ +75: let stdout = io::stdout(); +106- +107- // Wrap the stdin reader in a Snappy reader. +-- +207: let mut wtr = stdout.lock(); +240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); +305-} + +byte count:307 +"; + SearcherTester::new(CODE, "stdout") + .after_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +34-fn main() { +46- let stdin = io::stdin(); +75: let stdout = io::stdout(); +-- +107- // Wrap the stdin reader in a Snappy reader. +156- let mut rdr = snap::Reader::new(stdin.lock()); +207: let mut wtr = stdout.lock(); + +byte count:307 +"; + SearcherTester::new(CODE, "stdout") + .before_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn context_code3() { + let exp = "\ +20-use std::io; +33- +34:fn main() { +46- let stdin = io::stdin(); +75- let stdout = io::stdout(); +106- +107- // Wrap the stdin reader in a Snappy reader. +156: let mut rdr = snap::Reader::new(stdin.lock()); +207- let mut wtr = stdout.lock(); +240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); + +byte count:307 +"; + let exp_lines = "\ +3-20-use std::io; +4-33- +5:34:fn main() { +6-46- let stdin = io::stdin(); +7-75- let stdout = io::stdout(); +8-106- +9-107- // Wrap the stdin reader in a Snappy reader. 
+10:156: let mut rdr = snap::Reader::new(stdin.lock()); +11-207- let mut wtr = stdout.lock(); +12-240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); + +byte count:307 +"; + // before and after + line numbers + SearcherTester::new(CODE, "fn main|let mut rdr") + .after_context(2) + .before_context(2) + .line_number(true) + .expected_no_line_number(exp) + .expected_with_line_number(exp_lines) + .test(); + + // after + let exp = "\ +34:fn main() { +46- let stdin = io::stdin(); +75- let stdout = io::stdout(); +-- +156: let mut rdr = snap::Reader::new(stdin.lock()); +207- let mut wtr = stdout.lock(); +240- io::copy(&mut rdr, &mut wtr).expect(\"I/O operation failed\"); + +byte count:307 +"; + SearcherTester::new(CODE, "fn main|let mut rdr") + .after_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + + // before + let exp = "\ +20-use std::io; +33- +34:fn main() { +-- +106- +107- // Wrap the stdin reader in a Snappy reader. +156: let mut rdr = snap::Reader::new(stdin.lock()); + +byte count:307 +"; + SearcherTester::new(CODE, "fn main|let mut rdr") + .before_context(2) + .line_number(false) + .expected_no_line_number(exp) + .test(); + } + + #[test] + fn scratch() { + use sinks; + use testutil::RegexMatcher; + + const SHERLOCK: &'static [u8] = b"\ +For the Doctor Wat\xFFsons of this world, as opposed to the Sherlock +Holmeses, success in the province of detective work must always +be, to a very large extent, the result of luck. 
Sherlock Holmes +can extract a clew from a wisp of straw or a flake of cigar ash; +but Doctor Watson has to have it taken out for him and dusted, +and exhibited clearly, with a label attached.\ + "; + + let haystack = SHERLOCK; + let matcher = RegexMatcher::new("Sherlock"); + let mut searcher = SearcherBuilder::new() + .line_number(true) + .build(); + searcher.search_reader(&matcher, haystack, sinks::Lossy(|n, line| { + print!("{}:{}", n, line); + Ok(true) + })).unwrap(); + } +} diff --git a/grep-searcher/src/searcher/mmap.rs b/grep-searcher/src/searcher/mmap.rs new file mode 100644 index 000000000..85e0487fb --- /dev/null +++ b/grep-searcher/src/searcher/mmap.rs @@ -0,0 +1,106 @@ +use std::fs::File; +use std::path::Path; + +use memmap::Mmap; + +/// Controls the strategy used for determining when to use memory maps. +/// +/// If a searcher is called in circumstances where it is possible to use memory +/// maps, and memory maps are enabled, then it will attempt to do so if it +/// believes it will make the search faster. +/// +/// By default, memory maps are disabled. +#[derive(Clone, Debug)] +pub struct MmapChoice(MmapChoiceImpl); + +#[derive(Clone, Debug)] +enum MmapChoiceImpl { + Auto, + Never, +} + +impl Default for MmapChoice { + fn default() -> MmapChoice { + MmapChoice(MmapChoiceImpl::Never) + } +} + +impl MmapChoice { + /// Use memory maps when they are believed to be advantageous. + /// + /// The heuristics used to determine whether to use a memory map or not + /// may depend on many things, including but not limited to, file size + /// and platform. + /// + /// If memory maps are unavailable or cannot be used for a specific input, + /// then normal OS read calls are used instead. + /// + /// # Safety + /// + /// This constructor is not safe because there is no obvious way to + /// encapsulate the safety of file backed memory maps on all platforms + /// without simultaneously negating some or all of their benefits. 
+    ///
+    /// The specific contract the caller is required to uphold isn't precise,
+    /// but it basically amounts to something like, "the caller guarantees that
+    /// the underlying file won't be mutated." This, of course, isn't feasible
+    /// in many environments. However, command line tools may still decide to
+    /// take the risk of, say, a `SIGBUS` occurring while attempting to read a
+    /// memory map.
+    pub unsafe fn auto() -> MmapChoice {
+        MmapChoice(MmapChoiceImpl::Auto)
+    }
+
+    /// Never use memory maps, no matter what. This is the default.
+    pub fn never() -> MmapChoice {
+        MmapChoice(MmapChoiceImpl::Never)
+    }
+
+    /// Return a memory map if memory maps are enabled and if creating a
+    /// memory from the given file succeeded and if memory maps are believed
+    /// to be advantageous for performance.
+    ///
+    /// If this does attempt to open a memory map and it fails, then `None`
+    /// is returned and the corresponding error (along with the file path, if
+    /// present) is logged at the debug level.
+    pub(crate) fn open(
+        &self,
+        file: &File,
+        path: Option<&Path>,
+    ) -> Option<Mmap> {
+        if !self.is_enabled() {
+            return None;
+        }
+        if cfg!(target_os = "macos") {
+            // I guess memory maps on macOS aren't great. Should re-evaluate.
+            return None;
+        }
+        // SAFETY: This is acceptable because the only way `MmapChoiceImpl` can
+        // be `Auto` is if the caller invoked the `auto` constructor, which
+        // is itself not safe. Thus, this is a propagation of the caller's
+        // assertion that using memory maps is safe.
+        match unsafe { Mmap::map(file) } {
+            Ok(mmap) => Some(mmap),
+            Err(err) => {
+                if let Some(path) = path {
+                    debug!(
+                        "{}: failed to open memory map: {}",
+                        path.display(),
+                        err
+                    );
+                } else {
+                    debug!("failed to open memory map: {}", err);
+                }
+                None
+            }
+        }
+    }
+
+    /// Whether this strategy may employ memory maps or not.
+ pub(crate) fn is_enabled(&self) -> bool { + match self.0 { + MmapChoiceImpl::Auto => true, + MmapChoiceImpl::Never => false, + } + } +} diff --git a/grep-searcher/src/searcher/mod.rs b/grep-searcher/src/searcher/mod.rs new file mode 100644 index 000000000..bc428b687 --- /dev/null +++ b/grep-searcher/src/searcher/mod.rs @@ -0,0 +1,956 @@ +use std::cell::RefCell; +use std::cmp; +use std::fmt; +use std::fs::File; +use std::io::{self, Read}; +use std::path::Path; + +use encoding_rs; +use encoding_rs_io::DecodeReaderBytesBuilder; +use grep_matcher::{LineTerminator, Match, Matcher}; +use line_buffer::{ + self, BufferAllocation, LineBuffer, LineBufferBuilder, LineBufferReader, + DEFAULT_BUFFER_CAPACITY, alloc_error, +}; +use searcher::glue::{ReadByLine, SliceByLine, MultiLine}; +use sink::{Sink, SinkError}; + +pub use self::mmap::MmapChoice; + +mod core; +mod glue; +mod mmap; + +/// We use this type alias since we want the ergonomics of a matcher's `Match` +/// type, but in practice, we use it for arbitrary ranges, so give it a more +/// accurate name. This is only used in the searcher's internals. +type Range = Match; + +/// The behavior of binary detection while searching. +/// +/// Binary detection is the process of _heuristically_ identifying whether a +/// given chunk of data is binary or not, and then taking an action based on +/// the result of that heuristic. The motivation behind detecting binary data +/// is that binary data often indicates data that is undesirable to search +/// using textual patterns. Of course, there are many cases in which this isn't +/// true, which is why binary detection is disabled by default. +/// +/// Unfortunately, binary detection works differently depending on the type of +/// search being executed: +/// +/// 1. When performing a search using a fixed size buffer, binary detection is +/// applied to the buffer's contents as it is filled. 
Binary detection must +/// be applied to the buffer directly because binary files may not contain +/// line terminators, which could result in exorbitant memory usage. +/// 2. When performing a search using memory maps or by reading data off the +/// heap, then binary detection is only guaranteed to be applied to the +/// parts corresponding to a match. When `Quit` is enabled, then the first +/// few KB of the data are searched for binary data. +#[derive(Clone, Debug, Default)] +pub struct BinaryDetection(line_buffer::BinaryDetection); + +impl BinaryDetection { + /// No binary detection is performed. Data reported by the searcher may + /// contain arbitrary bytes. + /// + /// This is the default. + pub fn none() -> BinaryDetection { + BinaryDetection(line_buffer::BinaryDetection::None) + } + + /// Binary detection is performed by looking for the given byte. + /// + /// When searching is performed using a fixed size buffer, then the + /// contents of that buffer are always searched for the presence of this + /// byte. If it is found, then the underlying data is considered binary + /// and the search stops as if it reached EOF. + /// + /// When searching is performed with the entire contents mapped into + /// memory, then binary detection is more conservative. Namely, only a + /// fixed sized region at the beginning of the contents are detected for + /// binary data. As a compromise, any subsequent matching (or context) + /// lines are also searched for binary data. If binary data is detected at + /// any point, then the search stops as if it reached EOF. + pub fn quit(binary_byte: u8) -> BinaryDetection { + BinaryDetection(line_buffer::BinaryDetection::Quit(binary_byte)) + } + + // TODO(burntsushi): Figure out how to make binary conversion work. 
This + // permits implementing GNU grep's default behavior, which is to zap NUL + // bytes but still execute a search (if a match is detected, then GNU grep + // stops and reports that a match was found but doesn't print the matching + // line itself). + // + // This behavior is pretty simple to implement using the line buffer (and + // in fact, it is already implemented and tested), since there's a fixed + // size buffer that we can easily write to. The issue arises when searching + // a `&[u8]` (whether on the heap or via a memory map), since this isn't + // something we can easily write to. + + /// The given byte is searched in all contents read by the line buffer. If + /// it occurs, then it is replaced by the line terminator. The line buffer + /// guarantees that this byte will never be observable by callers. + #[allow(dead_code)] + fn convert(binary_byte: u8) -> BinaryDetection { + BinaryDetection(line_buffer::BinaryDetection::Convert(binary_byte)) + } +} + +/// An encoding to use when searching. +/// +/// An encoding can be used to configure a +/// [`SearcherBuilder`](struct.SearchBuilder.html) +/// to transcode source data from an encoding to UTF-8 before searching. +/// +/// An `Encoding` will always be cheap to clone. +#[derive(Clone, Debug)] +pub struct Encoding(&'static encoding_rs::Encoding); + +impl Encoding { + /// Create a new encoding for the specified label. + /// + /// The encoding label provided is mapped to an encoding via the set of + /// available choices specified in the + /// [Encoding Standard](https://encoding.spec.whatwg.org/#concept-encoding-get). + /// If the given label does not correspond to a valid encoding, then this + /// returns an error. 
+    pub fn new(label: &str) -> Result<Encoding, ConfigError> {
+        let label = label.as_bytes();
+        match encoding_rs::Encoding::for_label_no_replacement(label) {
+            Some(encoding) => Ok(Encoding(encoding)),
+            None => {
+                Err(ConfigError::UnknownEncoding { label: label.to_vec() })
+            }
+        }
+    }
+}
+
+/// The internal configuration of a searcher. This is shared among several
+/// search related types, but is only ever written to by the SearcherBuilder.
+#[derive(Clone, Debug)]
+pub struct Config {
+    /// The line terminator to use.
+    line_term: LineTerminator,
+    /// Whether to invert matching.
+    invert_match: bool,
+    /// The number of lines after a match to include.
+    after_context: usize,
+    /// The number of lines before a match to include.
+    before_context: usize,
+    /// Whether to enable unbounded context or not.
+    passthru: bool,
+    /// Whether to count line numbers.
+    line_number: bool,
+    /// The maximum amount of heap memory to use.
+    ///
+    /// When not given, no explicit limit is enforced. When set to `0`, then
+    /// only the memory map search strategy is available.
+    heap_limit: Option<usize>,
+    /// The memory map strategy.
+    mmap: MmapChoice,
+    /// The binary data detection strategy.
+    binary: BinaryDetection,
+    /// Whether to enable matching across multiple lines.
+    multi_line: bool,
+    /// An encoding that, when present, causes the searcher to transcode all
+    /// input from the encoding to UTF-8.
+    encoding: Option<Encoding>,
+}
+
+impl Default for Config {
+    fn default() -> Config {
+        Config {
+            line_term: LineTerminator::default(),
+            invert_match: false,
+            after_context: 0,
+            before_context: 0,
+            passthru: false,
+            line_number: true,
+            heap_limit: None,
+            mmap: MmapChoice::default(),
+            binary: BinaryDetection::default(),
+            multi_line: false,
+            encoding: None,
+        }
+    }
+}
+
+impl Config {
+    /// Return the maximal amount of lines needed to fulfill this
+    /// configuration's context.
+    ///
+    /// If this returns `0`, then no context is ever needed.
+    fn max_context(&self) -> usize {
+        cmp::max(self.before_context, self.after_context)
+    }
+
+    /// Build a line buffer from this configuration.
+    fn line_buffer(&self) -> LineBuffer {
+        let mut builder = LineBufferBuilder::new();
+        builder
+            .line_terminator(self.line_term.as_byte())
+            .binary_detection(self.binary.0);
+
+        if let Some(limit) = self.heap_limit {
+            let (capacity, additional) =
+                if limit <= DEFAULT_BUFFER_CAPACITY {
+                    (limit, 0)
+                } else {
+                    (DEFAULT_BUFFER_CAPACITY, limit - DEFAULT_BUFFER_CAPACITY)
+                };
+            builder
+                .capacity(capacity)
+                .buffer_alloc(BufferAllocation::Error(additional));
+        }
+        builder.build()
+    }
+}
+
+/// An error that can occur when building a searcher.
+///
+/// This error occurs when a non-sensical configuration is present when trying
+/// to construct a `Searcher` from a `SearcherBuilder`.
+#[derive(Clone, Debug, Eq, PartialEq)]
+pub enum ConfigError {
+    /// Indicates that the heap limit configuration prevents all possible
+    /// search strategies from being used. For example, if the heap limit is
+    /// set to 0 and memory map searching is disabled or unavailable.
+    SearchUnavailable,
+    /// Occurs when a matcher reports a line terminator that is different than
+    /// the one configured in the searcher.
+    MismatchedLineTerminators {
+        /// The matcher's line terminator.
+        matcher: LineTerminator,
+        /// The searcher's line terminator.
+        searcher: LineTerminator,
+    },
+    /// Occurs when no encoding could be found for a particular label.
+    UnknownEncoding {
+        /// The provided encoding label that could not be found.
+        label: Vec<u8>,
+    },
+    /// Hints that destructuring should not be exhaustive.
+    ///
+    /// This enum may grow additional variants, so this makes sure clients
+    /// don't count on exhaustive matching. (Otherwise, adding a new variant
+    /// could break existing code.)
+ #[doc(hidden)] + __Nonexhaustive, +} + +impl ::std::error::Error for ConfigError { + fn description(&self) -> &str { "grep-searcher configuration error" } +} + +impl fmt::Display for ConfigError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match *self { + ConfigError::SearchUnavailable => { + write!(f, "grep config error: no available searchers") + } + ConfigError::MismatchedLineTerminators { matcher, searcher } => { + write!( + f, + "grep config error: mismatched line terminators, \ + matcher has {:?} but searcher has {:?}", + matcher, + searcher + ) + } + ConfigError::UnknownEncoding { ref label } => { + write!( + f, + "grep config error: unknown encoding: {}", + String::from_utf8_lossy(label), + ) + } + _ => panic!("BUG: unexpected variant found"), + } + } +} + +/// A builder for configuring a searcher. +/// +/// A search builder permits specifying the configuration of a searcher, +/// including options like whether to invert the search or to enable multi +/// line search. +/// +/// Once a searcher has been built, it is beneficial to reuse that searcher +/// for multiple searches, if possible. +#[derive(Clone, Debug)] +pub struct SearcherBuilder { + config: Config, +} + +impl Default for SearcherBuilder { + fn default() -> SearcherBuilder { + SearcherBuilder::new() + } +} + +impl SearcherBuilder { + /// Create a new searcher builder with a default configuration. + pub fn new() -> SearcherBuilder { + SearcherBuilder { + config: Config::default(), + } + } + + /// Build a searcher with the given matcher. 
+ pub fn build(&self) -> Searcher { + let mut config = self.config.clone(); + if config.passthru { + config.before_context = 0; + config.after_context = 0; + } + let mut decode_builder = DecodeReaderBytesBuilder::new(); + decode_builder + .encoding(self.config.encoding.as_ref().map(|e| e.0)) + .utf8_passthru(true) + .bom_override(true); + Searcher { + config: config, + decode_builder: decode_builder, + decode_buffer: RefCell::new(vec![0; 8 * (1<<10)]), + line_buffer: RefCell::new(self.config.line_buffer()), + multi_line_buffer: RefCell::new(vec![]), + } + } + + /// Set the line terminator that is used by the searcher. + /// + /// When using a searcher, if the matcher provided has a line terminator + /// set, then it must be the same as this one. If they aren't, building + /// a searcher will return an error. + /// + /// By default, this is set to `b'\n'`. + pub fn line_terminator( + &mut self, + line_term: LineTerminator, + ) -> &mut SearcherBuilder { + self.config.line_term = line_term; + self + } + + /// Whether to invert matching, whereby lines that don't match are reported + /// instead of reporting lines that do match. + /// + /// By default, this is disabled. + pub fn invert_match(&mut self, yes: bool) -> &mut SearcherBuilder { + self.config.invert_match = yes; + self + } + + /// Whether to count and include line numbers with matching lines. + /// + /// This is enabled by default. There is a small performance penalty + /// associated with computing line numbers, so this can be disabled when + /// this isn't desirable. + pub fn line_number(&mut self, yes: bool) -> &mut SearcherBuilder { + self.config.line_number = yes; + self + } + + /// Whether to enable multi line search or not. + /// + /// When multi line search is enabled, matches *may* match across multiple + /// lines. Conversely, when multi line search is disabled, it is impossible + /// for any match to span more than one line. 
+ /// + /// **Warning:** multi line search requires having the entire contents to + /// search mapped in memory at once. When searching files, memory maps + /// will be used if possible and if they are enabled, which avoids using + /// your program's heap. However, if memory maps cannot be used (e.g., + /// for searching streams like `stdin` or if transcoding is necessary), + /// then the entire contents of the stream are read on to the heap before + /// starting the search. + /// + /// This is disabled by default. + pub fn multi_line(&mut self, yes: bool) -> &mut SearcherBuilder { + self.config.multi_line = yes; + self + } + + /// Whether to include a fixed number of lines after every match. + /// + /// When this is set to a non-zero number, then the searcher will report + /// `line_count` contextual lines after every match. + /// + /// This is set to `0` by default. + pub fn after_context( + &mut self, + line_count: usize, + ) -> &mut SearcherBuilder { + self.config.after_context = line_count; + self + } + + /// Whether to include a fixed number of lines before every match. + /// + /// When this is set to a non-zero number, then the searcher will report + /// `line_count` contextual lines before every match. + /// + /// This is set to `0` by default. + pub fn before_context( + &mut self, + line_count: usize, + ) -> &mut SearcherBuilder { + self.config.before_context = line_count; + self + } + + /// Whether to enable the "passthru" feature or not. + /// + /// When passthru is enabled, it effectively treats all non-matching lines + /// as contextual lines. In other words, enabling this is akin to + /// requesting an unbounded number of before and after contextual lines. + /// + /// When passthru mode is enabled, any `before_context` or `after_context` + /// settings are ignored by setting them to `0`. + /// + /// This is disabled by default. 
+ pub fn passthru(&mut self, yes: bool) -> &mut SearcherBuilder {
+ self.config.passthru = yes;
+ self
+ }
+
+ /// Set an approximate limit on the amount of heap space used by a
+ /// searcher.
+ ///
+ /// The heap limit is enforced in two scenarios:
+ ///
+ /// * When searching using a fixed size buffer, the heap limit controls
+ /// how big this buffer is allowed to be. Assuming contexts are disabled,
+ /// the minimum size of this buffer is the length (in bytes) of the
+ /// largest single line in the contents being searched. If any line
+ /// exceeds the heap limit, then an error will be returned.
+ /// * When performing a multi line search, a fixed size buffer cannot be
+ /// used. Thus, the only choices are to read the entire contents on to
+ /// the heap, or use memory maps. In the former case, the heap limit set
+ /// here is enforced.
+ ///
+ /// If a heap limit is set to `0`, then no heap space is used. If there are
+ /// no alternative strategies available for searching without heap space
+ /// (e.g., memory maps are disabled), then the searcher will return an error
+ /// immediately.
+ ///
+ /// By default, no limit is set.
+ pub fn heap_limit(
+ &mut self,
+ bytes: Option,
+ ) -> &mut SearcherBuilder {
+ self.config.heap_limit = bytes;
+ self
+ }
+
+ /// Set the strategy to employ use of memory maps.
+ ///
+ /// Currently, there are only two strategies that can be employed:
+ ///
+ /// * **Automatic** - A searcher will use heuristics, including but not
+ /// limited to file size and platform, to determine whether to use memory
+ /// maps or not.
+ /// * **Never** - Memory maps will never be used. If multi line search is
+ /// enabled, then the entire contents will be read on to the heap before
+ /// searching begins.
+ ///
+ /// The default behavior is **never**. Generally speaking, and perhaps
+ /// against conventional wisdom, memory maps don't necessarily enable
+ /// faster searching. 
For example, depending on the platform, using memory + /// maps while searching a large directory can actually be quite a bit + /// slower than using normal read calls because of the overhead of managing + /// the memory maps. + /// + /// Memory maps can be faster in some cases however. On some platforms, + /// when searching a very large file that *is already in memory*, it can + /// be slightly faster to search it as a memory map instead of using + /// normal read calls. + /// + /// Finally, memory maps have a somewhat complicated safety story in Rust. + /// If you aren't sure whether enabling memory maps is worth it, then just + /// don't bother with it. + /// + /// **WARNING**: If your process is searching a file backed memory map + /// at the same time that file is truncated, then it's possible for the + /// process to terminate with a bus error. + pub fn memory_map( + &mut self, + strategy: MmapChoice, + ) -> &mut SearcherBuilder { + self.config.mmap = strategy; + self + } + + /// Set the binary detection strategy. + /// + /// The binary detection strategy determines not only how the searcher + /// detects binary data, but how it responds to the presence of binary + /// data. See the [`BinaryDetection`](struct.BinaryDetection.html) type + /// for more information. + /// + /// By default, binary detection is disabled. + pub fn binary_detection( + &mut self, + detection: BinaryDetection, + ) -> &mut SearcherBuilder { + self.config.binary = detection; + self + } + + /// Set the encoding used to read the source data before searching. + /// + /// When an encoding is provided, then the source data is _unconditionally_ + /// transcoded using the encoding, unless a BOM is present. If a BOM is + /// present, then the encoding indicated by the BOM is used instead. If the + /// transcoding process encounters an error, then bytes are replaced with + /// the Unicode replacement codepoint. 
+ /// + /// When no encoding is specified (the default), then BOM sniffing is used + /// to determine whether the source data is UTF-8 or UTF-16, and + /// transcoding will be performed automatically. If no BOM could be found, + /// then the source data is searched _as if_ it were UTF-8. However, so + /// long as the source data is at least ASCII compatible, then it is + /// possible for a search to produce useful results. + pub fn encoding( + &mut self, + encoding: Option, + ) -> &mut SearcherBuilder { + self.config.encoding = encoding; + self + } +} + +/// A searcher executes searches over a haystack and writes results to a caller +/// provided sink. +/// +/// Matches are detected via implementations of the `Matcher` trait, which must +/// be provided by the caller when executing a search. +/// +/// When possible, a searcher should be reused. +#[derive(Clone, Debug)] +pub struct Searcher { + /// The configuration for this searcher. + /// + /// We make most of these settings available to users of `Searcher` via + /// public API methods, which can be queried in implementations of `Sink` + /// if necessary. + config: Config, + /// A builder for constructing a streaming reader that transcodes source + /// data according to either an explicitly specified encoding or via an + /// automatically detected encoding via BOM sniffing. + /// + /// When no transcoding is needed, then the transcoder built will pass + /// through the underlying bytes with no additional overhead. + decode_builder: DecodeReaderBytesBuilder, + /// A buffer that is used for transcoding scratch space. + decode_buffer: RefCell>, + /// A line buffer for use in line oriented searching. + /// + /// We wrap it in a RefCell to permit lending out borrows of `Searcher` + /// to sinks. We still require a mutable borrow to execute a search, so + /// we statically prevent callers from causing RefCell to panic at runtime + /// due to a borrowing violation. 
+ line_buffer: RefCell, + /// A buffer in which to store the contents of a reader when performing a + /// multi line search. In particular, multi line searches cannot be + /// performed incrementally, and need the entire haystack in memory at + /// once. + multi_line_buffer: RefCell>, +} + +impl Searcher { + /// Create a new searcher with a default configuration. + /// + /// To configure the searcher (e.g., invert matching, enable memory maps, + /// enable contexts, etc.), use the + /// [`SearcherBuilder`](struct.SearcherBuilder.html). + pub fn new() -> Searcher { + SearcherBuilder::new().build() + } + + /// Execute a search over the file with the given path and write the + /// results to the given sink. + /// + /// If memory maps are enabled and the searcher heuristically believes + /// memory maps will help the search run faster, then this will use + /// memory maps. For this reason, callers should prefer using this method + /// or `search_file` over the more generic `search_reader` when possible. + pub fn search_path( + &mut self, + matcher: M, + path: P, + write_to: S, + ) -> Result<(), S::Error> + where P: AsRef, + M: Matcher, + S: Sink, + { + let path = path.as_ref(); + let file = File::open(path).map_err(S::Error::error_io)?; + self.search_file_maybe_path(matcher, Some(path), &file, write_to) + } + + /// Execute a search over a file and write the results to the given sink. + /// + /// If memory maps are enabled and the searcher heuristically believes + /// memory maps will help the search run faster, then this will use + /// memory maps. For this reason, callers should prefer using this method + /// or `search_path` over the more generic `search_reader` when possible. 
+ pub fn search_file( + &mut self, + matcher: M, + file: &File, + write_to: S, + ) -> Result<(), S::Error> + where M: Matcher, + S: Sink, + { + self.search_file_maybe_path(matcher, None, file, write_to) + } + + fn search_file_maybe_path( + &mut self, + matcher: M, + path: Option<&Path>, + file: &File, + write_to: S, + ) -> Result<(), S::Error> + where M: Matcher, + S: Sink, + { + if let Some(mmap) = self.config.mmap.open(file, path) { + trace!("{:?}: searching via memory map", path); + return self.search_slice(matcher, &mmap, write_to); + } + // Fast path for multi-line searches of files when memory maps are + // not enabled. This pre-allocates a buffer roughly the size of the + // file, which isn't possible when searching an arbitrary io::Read. + if self.multi_line_with_matcher(&matcher) { + trace!("{:?}: reading entire file on to heap for mulitline", path); + self.fill_multi_line_buffer_from_file::(file)?; + trace!("{:?}: searching via multiline strategy", path); + MultiLine::new( + self, + matcher, + &*self.multi_line_buffer.borrow(), + write_to, + ).run() + } else { + trace!("{:?}: searching using generic reader", path); + self.search_reader(matcher, file, write_to) + } + } + + /// Execute a search over any implementation of `io::Read` and write the + /// results to the given sink. + /// + /// When possible, this implementation will search the reader incrementally + /// without reading it into memory. In some cases---for example, if multi + /// line search is enabled---an incremental search isn't possible and the + /// given reader is consumed completely and placed on the heap before + /// searching begins. For this reason, when multi line search is enabled, + /// one should try to use higher level APIs (e.g., searching by file or + /// file path) so that memory maps can be used if they are available and + /// enabled. 
+ pub fn search_reader( + &mut self, + matcher: M, + read_from: R, + write_to: S, + ) -> Result<(), S::Error> + where M: Matcher, + R: io::Read, + S: Sink, + { + self.check_config(&matcher).map_err(S::Error::error_config)?; + + let mut decode_buffer = self.decode_buffer.borrow_mut(); + let read_from = self.decode_builder + .build_with_buffer(read_from, &mut *decode_buffer) + .map_err(S::Error::error_io)?; + + if self.multi_line_with_matcher(&matcher) { + trace!("generic reader: reading everything to heap for multiline"); + self.fill_multi_line_buffer_from_reader::<_, S>(read_from)?; + trace!("generic reader: searching via multiline strategy"); + MultiLine::new( + self, + matcher, + &*self.multi_line_buffer.borrow(), + write_to, + ).run() + } else { + let mut line_buffer = self.line_buffer.borrow_mut(); + let rdr = LineBufferReader::new(read_from, &mut *line_buffer); + trace!("generic reader: searching via roll buffer strategy"); + ReadByLine::new(self, matcher, rdr, write_to).run() + } + } + + /// Execute a search over the given slice and write the results to the + /// given sink. + pub fn search_slice( + &mut self, + matcher: M, + slice: &[u8], + write_to: S, + ) -> Result<(), S::Error> + where M: Matcher, + S: Sink, + { + self.check_config(&matcher).map_err(S::Error::error_config)?; + + // We can search the slice directly, unless we need to do transcoding. + if self.slice_needs_transcoding(slice) { + trace!("slice reader: needs transcoding, using generic reader"); + return self.search_reader(matcher, slice, write_to); + } + if self.multi_line_with_matcher(&matcher) { + trace!("slice reader: searching via multiline strategy"); + MultiLine::new(self, matcher, slice, write_to).run() + } else { + trace!("slice reader: searching via slice-by-line strategy"); + SliceByLine::new(self, matcher, slice, write_to).run() + } + } + + /// Check that the searcher's configuration and the matcher are consistent + /// with each other. 
+ fn check_config(&self, matcher: M) -> Result<(), ConfigError> { + if self.config.heap_limit == Some(0) + && !self.config.mmap.is_enabled() + { + return Err(ConfigError::SearchUnavailable); + } + let matcher_line_term = match matcher.line_terminator() { + None => return Ok(()), + Some(line_term) => line_term, + }; + if matcher_line_term != self.config.line_term { + return Err(ConfigError::MismatchedLineTerminators { + matcher: matcher_line_term, + searcher: self.config.line_term, + }); + } + Ok(()) + } + + /// Returns true if and only if the given slice needs to be transcoded. + fn slice_needs_transcoding(&self, slice: &[u8]) -> bool { + self.config.encoding.is_some() || slice_has_utf16_bom(slice) + } +} + +/// The following methods permit querying the configuration of a searcher. +/// These can be useful in generic implementations of +/// [`Sink`](trait.Sink.html), +/// where the output may be tailored based on how the searcher is configured. +impl Searcher { + /// Returns the line terminator used by this searcher. + #[inline] + pub fn line_terminator(&self) -> LineTerminator { + self.config.line_term + } + + /// Returns true if and only if this searcher is configured to invert its + /// search results. That is, matching lines are lines that do **not** match + /// the searcher's matcher. + #[inline] + pub fn invert_match(&self) -> bool { + self.config.invert_match + } + + /// Returns true if and only if this searcher is configured to count line + /// numbers. + #[inline] + pub fn line_number(&self) -> bool { + self.config.line_number + } + + /// Returns true if and only if this searcher is configured to perform + /// multi line search. + #[inline] + pub fn multi_line(&self) -> bool { + self.config.multi_line + } + + /// Returns true if and only if this searcher will choose a multi-line + /// strategy given the provided matcher. 
+ /// + /// This may diverge from the result of `multi_line` in cases where the + /// searcher has been configured to execute a search that can report + /// matches over multiple lines, but where the matcher guarantees that it + /// will never produce a match over multiple lines. + pub fn multi_line_with_matcher(&self, matcher: M) -> bool { + if !self.multi_line() { + return false; + } + if let Some(line_term) = matcher.line_terminator() { + if line_term == self.line_terminator() { + return false; + } + } + if let Some(non_matching) = matcher.non_matching_bytes() { + // If the line terminator is CRLF, we don't actually need to care + // whether the regex can match `\r` or not. Namely, a `\r` is + // neither necessary nor sufficient to terminate a line. A `\n` is + // always required. + if non_matching.contains(self.line_terminator().as_byte()) { + return false; + } + } + true + } + + /// Returns the number of "after" context lines to report. When context + /// reporting is not enabled, this returns `0`. + #[inline] + pub fn after_context(&self) -> usize { + self.config.after_context + } + + /// Returns the number of "before" context lines to report. When context + /// reporting is not enabled, this returns `0`. + #[inline] + pub fn before_context(&self) -> usize { + self.config.before_context + } + + /// Returns true if and only if the searcher has "passthru" mode enabled. + #[inline] + pub fn passthru(&self) -> bool { + self.config.passthru + } + + /// Fill the buffer for use with multi-line searching from the given file. + /// This reads from the file until EOF or until an error occurs. If the + /// contents exceed the configured heap limit, then an error is returned. 
+ fn fill_multi_line_buffer_from_file( + &self, + file: &File, + ) -> Result<(), S::Error> { + assert!(self.config.multi_line); + + let mut decode_buffer = self.decode_buffer.borrow_mut(); + let mut read_from = self.decode_builder + .build_with_buffer(file, &mut *decode_buffer) + .map_err(S::Error::error_io)?; + + // If we don't have a heap limit, then we can defer to std's + // read_to_end implementation. fill_multi_line_buffer_from_reader will + // do this too, but since we have a File, we can be a bit smarter about + // pre-allocating here. + // + // If we're transcoding, then our pre-allocation might not be exact, + // but is probably still better than nothing. + if self.config.heap_limit.is_none() { + let mut buf = self.multi_line_buffer.borrow_mut(); + buf.clear(); + let cap = file + .metadata() + .map(|m| m.len() as usize + 1) + .unwrap_or(0); + buf.reserve(cap); + read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?; + return Ok(()); + } + self.fill_multi_line_buffer_from_reader::<_, S>(read_from) + } + + /// Fill the buffer for use with multi-line searching from the given + /// reader. This reads from the reader until EOF or until an error occurs. + /// If the contents exceed the configured heap limit, then an error is + /// returned. + fn fill_multi_line_buffer_from_reader( + &self, + mut read_from: R, + ) -> Result<(), S::Error> { + assert!(self.config.multi_line); + + let mut buf = self.multi_line_buffer.borrow_mut(); + buf.clear(); + + // If we don't have a heap limit, then we can defer to std's + // read_to_end implementation... + let heap_limit = match self.config.heap_limit { + Some(heap_limit) => heap_limit, + None => { + read_from.read_to_end(&mut *buf).map_err(S::Error::error_io)?; + return Ok(()); + } + }; + if heap_limit == 0 { + return Err(S::Error::error_io(alloc_error(heap_limit))); + } + + // ... otherwise we need to roll our own. 
This is likely quite a bit
+ // slower than what is optimal, but we avoid worrying about memory safety
+ // until there's a compelling reason to speed this up.
+ buf.resize(cmp::min(DEFAULT_BUFFER_CAPACITY, heap_limit), 0);
+ let mut pos = 0;
+ loop {
+ let nread = match read_from.read(&mut buf[pos..]) {
+ Ok(nread) => nread,
+ Err(ref err) if err.kind() == io::ErrorKind::Interrupted => {
+ continue;
+ }
+ Err(err) => return Err(S::Error::error_io(err)),
+ };
+ if nread == 0 {
+ buf.resize(pos, 0);
+ return Ok(());
+ }
+
+ pos += nread;
+ if buf[pos..].is_empty() {
+ let additional = heap_limit - buf.len();
+ if additional == 0 {
+ return Err(S::Error::error_io(alloc_error(heap_limit)));
+ }
+ let limit = buf.len() + additional;
+ let doubled = 2 * buf.len();
+ buf.resize(cmp::min(doubled, limit), 0);
+ }
+ }
+ }
+}
+
+/// Returns true if and only if the given slice begins with a UTF-16 BOM.
+///
+/// This is used by the searcher to determine if a transcoder is necessary.
+/// Otherwise, it is advantageous to search the slice directly. 
+fn slice_has_utf16_bom(slice: &[u8]) -> bool { + let enc = match encoding_rs::Encoding::for_bom(slice) { + None => return false, + Some((enc, _)) => enc, + }; + [encoding_rs::UTF_16LE, encoding_rs::UTF_16BE].contains(&enc) +} + +#[cfg(test)] +mod tests { + use testutil::{KitchenSink, RegexMatcher}; + use super::*; + + #[test] + fn config_error_heap_limit() { + let matcher = RegexMatcher::new(""); + let sink = KitchenSink::new(); + let mut searcher = SearcherBuilder::new() + .heap_limit(Some(0)) + .build(); + let res = searcher.search_slice(matcher, &[], sink); + assert!(res.is_err()); + } + + #[test] + fn config_error_line_terminator() { + let mut matcher = RegexMatcher::new(""); + matcher.set_line_term(Some(LineTerminator::byte(b'z'))); + + let sink = KitchenSink::new(); + let mut searcher = Searcher::new(); + let res = searcher.search_slice(matcher, &[], sink); + assert!(res.is_err()); + } +} diff --git a/grep-searcher/src/sink.rs b/grep-searcher/src/sink.rs new file mode 100644 index 000000000..2ed1e6c1e --- /dev/null +++ b/grep-searcher/src/sink.rs @@ -0,0 +1,559 @@ +use std::fmt; +use std::io; + +use grep_matcher::LineTerminator; + +use lines::LineIter; +use searcher::{ConfigError, Searcher}; + +/// A trait that describes errors that can be reported by searchers and +/// implementations of `Sink`. +/// +/// Unless you have a specialized use case, you probably don't need to +/// implement this trait explicitly. It's likely that using `io::Error` (which +/// implements this trait) for your error type is good enough, largely because +/// most errors that occur during search will likely be an `io::Error`. +pub trait SinkError: Sized { + /// A constructor for converting any value that satisfies the + /// `fmt::Display` trait into an error. + fn error_message(message: T) -> Self; + + /// A constructor for converting I/O errors that occur while searching into + /// an error of this type. 
+ /// + /// By default, this is implemented via the `error_message` constructor. + fn error_io(err: io::Error) -> Self { + Self::error_message(err) + } + + /// A constructor for converting configuration errors that occur while + /// building a searcher into an error of this type. + /// + /// By default, this is implemented via the `error_message` constructor. + fn error_config(err: ConfigError) -> Self { + Self::error_message(err) + } +} + +/// An `io::Error` can be used as an error for `Sink` implementations out of +/// the box. +impl SinkError for io::Error { + fn error_message(message: T) -> io::Error { + io::Error::new(io::ErrorKind::Other, message.to_string()) + } + + fn error_io(err: io::Error) -> io::Error { + err + } +} + +/// A `Box` can be used as an error for `Sink` +/// implementations out of the box. +impl SinkError for Box<::std::error::Error> { + fn error_message(message: T) -> Box<::std::error::Error> { + Box::<::std::error::Error>::from(message.to_string()) + } +} + +/// A trait that defines how results from searchers are handled. +/// +/// In this crate, a searcher follows the "push" model. What that means is that +/// the searcher drives execution, and pushes results back to the caller. This +/// is in contrast to a "pull" model where the caller drives execution and +/// takes results as they need them. These are also known as "internal" and +/// "external" iteration strategies, respectively. +/// +/// For a variety of reasons, including the complexity of the searcher +/// implementation, this crate chooses the "push" or "internal" model of +/// execution. Thus, in order to act on search results, callers must provide +/// an implementation of this trait to a searcher, and the searcher is then +/// responsible for calling the methods on this trait. +/// +/// This trait defines several behaviors: +/// +/// * What to do when a match is found. Callers must provide this. +/// * What to do when an error occurs. 
Callers must provide this via the +/// [`SinkError`](trait.SinkError.html) trait. Generally, callers can just +/// use `io::Error` for this, which already implements `SinkError`. +/// * What to do when a contextual line is found. By default, these are +/// ignored. +/// * What to do when a gap between contextual lines has been found. By +/// default, this is ignored. +/// * What to do when a search has started. By default, this does nothing. +/// * What to do when a search has finished successfully. By default, this does +/// nothing. +/// +/// Callers must, at minimum, specify the behavior when an error occurs and +/// the behavior when a match occurs. The rest is optional. For each behavior, +/// callers may report an error (say, if writing the result to another +/// location failed) or simply return `false` if they want the search to stop +/// (e.g., when implementing a cap on the number of search results to show). +/// +/// When errors are reported (whether in the searcher or in the implementation +/// of `Sink`), then searchers quit immediately without calling `finish`. +/// +/// For simpler uses of `Sink`, callers may elect to use one of +/// the more convenient but less flexible implementations in the +/// [`sinks`](sinks/index.html) module. +pub trait Sink { + /// The type of an error that should be reported by a searcher. + /// + /// Errors of this type are not only returned by the methods on this + /// trait, but the constructors defined in `SinkError` are also used in + /// the searcher implementation itself. e.g., When a I/O error occurs when + /// reading data from a file. + type Error: SinkError; + + /// This method is called whenever a match is found. + /// + /// If multi line is enabled on the searcher, then the match reported here + /// may span multiple lines and it may include multiple matches. 
When multi + /// line is disabled, then the match is guaranteed to span exactly one + /// non-empty line (where a single line is, at minimum, a line terminator). + /// + /// If this returns `true`, then searching continues. If this returns + /// `false`, then searching is stopped immediately and `finish` is called. + /// + /// If this returns an error, then searching is stopped immediately, + /// `finish` is not called and the error is bubbled back up to the caller + /// of the searcher. + fn matched( + &mut self, + _searcher: &Searcher, + _mat: &SinkMatch, + ) -> Result; + + /// This method is called whenever a context line is found, and is optional + /// to implement. By default, it does nothing and returns `true`. + /// + /// In all cases, the context given is guaranteed to span exactly one + /// non-empty line (where a single line is, at minimum, a line terminator). + /// + /// If this returns `true`, then searching continues. If this returns + /// `false`, then searching is stopped immediately and `finish` is called. + /// + /// If this returns an error, then searching is stopped immediately, + /// `finish` is not called and the error is bubbled back up to the caller + /// of the searcher. + #[inline] + fn context( + &mut self, + _searcher: &Searcher, + _context: &SinkContext, + ) -> Result { + Ok(true) + } + + /// This method is called whenever a break in contextual lines is found, + /// and is optional to implement. By default, it does nothing and returns + /// `true`. + /// + /// A break can only occur when context reporting is enabled (that is, + /// either or both of `before_context` or `after_context` are greater than + /// `0`). More precisely, a break occurs between non-contiguous groups of + /// lines. + /// + /// If this returns `true`, then searching continues. If this returns + /// `false`, then searching is stopped immediately and `finish` is called. 
+ /// + /// If this returns an error, then searching is stopped immediately, + /// `finish` is not called and the error is bubbled back up to the caller + /// of the searcher. + #[inline] + fn context_break( + &mut self, + _searcher: &Searcher, + ) -> Result { + Ok(true) + } + + /// This method is called when a search has begun, before any search is + /// executed. By default, this does nothing. + /// + /// If this returns `true`, then searching continues. If this returns + /// `false`, then searching is stopped immediately and `finish` is called. + /// + /// If this returns an error, then searching is stopped immediately, + /// `finish` is not called and the error is bubbled back up to the caller + /// of the searcher. + #[inline] + fn begin( + &mut self, + _searcher: &Searcher, + ) -> Result { + Ok(true) + } + + /// This method is called when a search has completed. By default, this + /// does nothing. + /// + /// If this returns an error, the error is bubbled back up to the caller of + /// the searcher. + #[inline] + fn finish( + &mut self, + _searcher: &Searcher, + _: &SinkFinish, + ) -> Result<(), Self::Error> { + Ok(()) + } +} + +impl<'a, S: Sink> Sink for &'a mut S { + type Error = S::Error; + + #[inline] + fn matched( + &mut self, + searcher: &Searcher, + mat: &SinkMatch, + ) -> Result { + (**self).matched(searcher, mat) + } + + #[inline] + fn context( + &mut self, + searcher: &Searcher, + context: &SinkContext, + ) -> Result { + (**self).context(searcher, context) + } + + #[inline] + fn context_break( + &mut self, + searcher: &Searcher, + ) -> Result { + (**self).context_break(searcher) + } + + #[inline] + fn begin( + &mut self, + searcher: &Searcher, + ) -> Result { + (**self).begin(searcher) + } + + #[inline] + fn finish( + &mut self, + searcher: &Searcher, + sink_finish: &SinkFinish, + ) -> Result<(), S::Error> { + (**self).finish(searcher, sink_finish) + } +} + +/// Summary data reported at the end of a search. 
+/// +/// This reports data such as the total number of bytes searched and the +/// absolute offset of the first occurrence of binary data, if any were found. +/// +/// A searcher that stops early because of an error does not call `finish`. +/// A searcher that stops early because the `Sink` implementor instructed it +/// to will still call `finish`. +#[derive(Clone, Debug)] +pub struct SinkFinish { + pub(crate) byte_count: u64, + pub(crate) binary_byte_offset: Option, +} + +impl SinkFinish { + /// Return the total number of bytes searched. + #[inline] + pub fn byte_count(&self) -> u64 { + self.byte_count + } + + /// If binary detection is enabled and if binary data was found, then this + /// returns the absolute byte offset of the first detected byte of binary + /// data. + /// + /// Note that since this is an absolute byte offset, it cannot be relied + /// upon to index into any addressable memory. + #[inline] + pub fn binary_byte_offset(&self) -> Option { + self.binary_byte_offset + } +} + +/// A type that describes a match reported by a searcher. +#[derive(Clone, Debug)] +pub struct SinkMatch<'b> { + pub(crate) line_term: LineTerminator, + pub(crate) bytes: &'b [u8], + pub(crate) absolute_byte_offset: u64, + pub(crate) line_number: Option, +} + +impl<'b> SinkMatch<'b> { + /// Returns the bytes for all matching lines, including the line + /// terminators, if they exist. + #[inline] + pub fn bytes(&self) -> &'b [u8] { + self.bytes + } + + /// Return an iterator over the lines in this match. + /// + /// If multi line search is enabled, then this may yield more than one + /// line (but always at least one line). If multi line search is disabled, + /// then this always reports exactly one line (but may consist of just + /// the line terminator). + /// + /// Lines yielded by this iterator include their terminators. 
+ #[inline] + pub fn lines(&self) -> LineIter<'b> { + LineIter::new(self.line_term.as_byte(), self.bytes) + } + + /// Returns the absolute byte offset of the start of this match. This + /// offset is absolute in that it is relative to the very beginning of the + /// input in a search, and can never be relied upon to be a valid index + /// into an in-memory slice. + #[inline] + pub fn absolute_byte_offset(&self) -> u64 { + self.absolute_byte_offset + } + + /// Returns the line number of the first line in this match, if available. + /// + /// Line numbers are only available when the search builder is instructed + /// to compute them. + #[inline] + pub fn line_number(&self) -> Option { + self.line_number + } +} + +/// The type of context reported by a searcher. +#[derive(Clone, Debug, Eq, PartialEq)] +pub enum SinkContextKind { + /// The line reported occurred before a match. + Before, + /// The line reported occurred after a match. + After, + /// Any other type of context reported, e.g., as a result of a searcher's + /// "passthru" mode. + Other, +} + +/// A type that describes a contextual line reported by a searcher. +#[derive(Clone, Debug)] +pub struct SinkContext<'b> { + pub(crate) line_term: LineTerminator, + pub(crate) bytes: &'b [u8], + pub(crate) kind: SinkContextKind, + pub(crate) absolute_byte_offset: u64, + pub(crate) line_number: Option, +} + +impl<'b> SinkContext<'b> { + /// Returns the context bytes, including line terminators. + #[inline] + pub fn bytes(&self) -> &'b [u8] { + self.bytes + } + + /// Returns the type of context. + #[inline] + pub fn kind(&self) -> &SinkContextKind { + &self.kind + } + + /// Return an iterator over the lines in this match. + /// + /// This always yields exactly one line (and that one line may contain just + /// the line terminator). + /// + /// Lines yielded by this iterator include their terminators. 
+    #[cfg(test)]
+    pub(crate) fn lines(&self) -> LineIter<'b> {
+        LineIter::new(self.line_term.as_byte(), self.bytes)
+    }
+
+    /// Returns the absolute byte offset of the start of this context. This
+    /// offset is absolute in that it is relative to the very beginning of the
+    /// input in a search, and can never be relied upon to be a valid index
+    /// into an in-memory slice.
+    #[inline]
+    pub fn absolute_byte_offset(&self) -> u64 {
+        self.absolute_byte_offset
+    }
+
+    /// Returns the line number of the first line in this context, if
+    /// available.
+    ///
+    /// Line numbers are only available when the search builder is instructed
+    /// to compute them.
+    #[inline]
+    pub fn line_number(&self) -> Option<u64> {
+        self.line_number
+    }
+}
+
+/// A collection of convenience implementations of `Sink`.
+///
+/// Each implementation in this module makes some kind of sacrifice in the name
+/// of making common cases easier to use. Most frequently, each type is a
+/// wrapper around a closure specified by the caller that provides limited
+/// access to the full suite of information available to implementors of
+/// `Sink`.
+///
+/// For example, the `UTF8` sink makes the following sacrifices:
+///
+/// * All matches must be UTF-8. An arbitrary `Sink` does not have this
+/// restriction and can deal with arbitrary data. If this sink sees invalid
+/// UTF-8, then an error is returned and searching stops. (Use the `Lossy`
+/// sink instead to suppress this error.)
+/// * The searcher must be configured to report line numbers. If it isn't,
+/// an error is reported at the first match and searching stops.
+/// * Context lines, context breaks and summary data reported at the end of
+/// a search are all ignored.
+/// * Implementors are forced to use `io::Error` as their error type.
+///
+/// If you need more flexibility, then you're advised to implement the `Sink`
+/// trait directly.
+pub mod sinks {
+    use std::io;
+    use std::str;
+
+    use searcher::Searcher;
+    use super::{Sink, SinkError, SinkMatch};
+
+    /// A sink that provides line numbers and matches as strings while ignoring
+    /// everything else.
+    ///
+    /// This implementation will return an error if a match contains invalid
+    /// UTF-8 or if the searcher was not configured to count lines. Errors
+    /// on invalid UTF-8 can be suppressed by using the `Lossy` sink instead
+    /// of this one.
+    ///
+    /// The closure accepts two parameters: a line number and a UTF-8 string
+    /// containing the matched data. The closure returns a
+    /// `Result<bool, io::Error>`. If the `bool` is `false`, then the search
+    /// stops immediately. Otherwise, searching continues.
+    ///
+    /// If multi line mode was enabled, the line number refers to the line
+    /// number of the first line in the match.
+    #[derive(Clone, Debug)]
+    pub struct UTF8<F>(pub F)
+    where F: FnMut(u64, &str) -> Result<bool, io::Error>;
+
+    impl<F> Sink for UTF8<F>
+    where F: FnMut(u64, &str) -> Result<bool, io::Error>
+    {
+        type Error = io::Error;
+
+        fn matched(
+            &mut self,
+            _searcher: &Searcher,
+            mat: &SinkMatch,
+        ) -> Result<bool, io::Error> {
+            let matched = match str::from_utf8(mat.bytes()) {
+                Ok(matched) => matched,
+                Err(err) => return Err(io::Error::error_message(err)),
+            };
+            let line_number = match mat.line_number() {
+                Some(line_number) => line_number,
+                None => {
+                    let msg = "line numbers not enabled";
+                    return Err(io::Error::error_message(msg));
+                }
+            };
+            (self.0)(line_number, &matched)
+        }
+    }
+
+    /// A sink that provides line numbers and matches as (lossily converted)
+    /// strings while ignoring everything else.
+    ///
+    /// This is like `UTF8`, except that if a match contains invalid UTF-8,
+    /// then it will be lossily converted to valid UTF-8 by substituting
+    /// invalid UTF-8 with Unicode replacement characters.
+    ///
+    /// This implementation will return an error on the first match if the
+    /// searcher was not configured to count lines.
+    ///
+    /// The closure accepts two parameters: a line number and a UTF-8 string
+    /// containing the matched data. The closure returns a
+    /// `Result<bool, io::Error>`. If the `bool` is `false`, then the search
+    /// stops immediately. Otherwise, searching continues.
+    ///
+    /// If multi line mode was enabled, the line number refers to the line
+    /// number of the first line in the match.
+    #[derive(Clone, Debug)]
+    pub struct Lossy<F>(pub F)
+    where F: FnMut(u64, &str) -> Result<bool, io::Error>;
+
+    impl<F> Sink for Lossy<F>
+    where F: FnMut(u64, &str) -> Result<bool, io::Error>
+    {
+        type Error = io::Error;
+
+        fn matched(
+            &mut self,
+            _searcher: &Searcher,
+            mat: &SinkMatch,
+        ) -> Result<bool, io::Error> {
+            use std::borrow::Cow;
+
+            let matched = match str::from_utf8(mat.bytes()) {
+                Ok(matched) => Cow::Borrowed(matched),
+                // TODO: In theory, it should be possible to amortize
+                // allocation here, but `std` doesn't provide such an API.
+                // Regardless, this only happens on matches with invalid UTF-8,
+                // which should be pretty rare.
+                Err(_) => String::from_utf8_lossy(mat.bytes()),
+            };
+            let line_number = match mat.line_number() {
+                Some(line_number) => line_number,
+                None => {
+                    let msg = "line numbers not enabled";
+                    return Err(io::Error::error_message(msg));
+                }
+            };
+            (self.0)(line_number, &matched)
+        }
+    }
+
+    /// A sink that provides line numbers and matches as raw bytes while
+    /// ignoring everything else.
+    ///
+    /// This implementation will return an error on the first match if the
+    /// searcher was not configured to count lines.
+    ///
+    /// The closure accepts two parameters: a line number and a raw byte string
+    /// containing the matched data. The closure returns a `Result<bool,
+    /// io::Error>`. If the `bool` is `false`, then the search stops
+    /// immediately. Otherwise, searching continues.
+    ///
+    /// If multi line mode was enabled, the line number refers to the line
+    /// number of the first line in the match.
+    #[derive(Clone, Debug)]
+    pub struct Bytes<F>(pub F)
+    where F: FnMut(u64, &[u8]) -> Result<bool, io::Error>;
+
+    impl<F> Sink for Bytes<F>
+    where F: FnMut(u64, &[u8]) -> Result<bool, io::Error>
+    {
+        type Error = io::Error;
+
+        fn matched(
+            &mut self,
+            _searcher: &Searcher,
+            mat: &SinkMatch,
+        ) -> Result<bool, io::Error> {
+            let line_number = match mat.line_number() {
+                Some(line_number) => line_number,
+                None => {
+                    let msg = "line numbers not enabled";
+                    return Err(io::Error::error_message(msg));
+                }
+            };
+            (self.0)(line_number, mat.bytes())
+        }
+    }
+}
diff --git a/grep-searcher/src/testutil.rs b/grep-searcher/src/testutil.rs
new file mode 100644
index 000000000..b51508a1e
--- /dev/null
+++ b/grep-searcher/src/testutil.rs
@@ -0,0 +1,787 @@
+use std::io::{self, Write};
+use std::str;
+
+use grep_matcher::{
+    LineMatchKind, LineTerminator, Match, Matcher, NoCaptures, NoError,
+};
+use memchr::memchr;
+use regex::bytes::{Regex, RegexBuilder};
+
+use searcher::{BinaryDetection, Searcher, SearcherBuilder};
+use sink::{Sink, SinkContext, SinkFinish, SinkMatch};
+
+/// A simple regex matcher.
+///
+/// This supports setting the matcher's line terminator configuration directly,
+/// which we use for testing purposes. That is, the caller explicitly
+/// determines whether the line terminator optimization is enabled. (In reality
+/// this optimization is detected automatically by inspecting and possibly
+/// modifying the regex itself.)
+#[derive(Clone, Debug)]
+pub struct RegexMatcher {
+    regex: Regex,
+    line_term: Option<LineTerminator>,
+    every_line_is_candidate: bool,
+}
+
+impl RegexMatcher {
+    /// Create a new regex matcher.
+    pub fn new(pattern: &str) -> RegexMatcher {
+        let regex = RegexBuilder::new(pattern)
+            .multi_line(true) // permits ^ and $ to match at \n boundaries
+            .build()
+            .unwrap();
+        RegexMatcher {
+            regex: regex,
+            line_term: None,
+            every_line_is_candidate: false,
+        }
+    }
+
+    /// Forcefully set the line terminator of this matcher.
+    ///
+    /// By default, this matcher has no line terminator set.
+    pub fn set_line_term(
+        &mut self,
+        line_term: Option<LineTerminator>,
+    ) -> &mut RegexMatcher {
+        self.line_term = line_term;
+        self
+    }
+
+    /// Whether to return every line as a candidate or not.
+    ///
+    /// This forces searchers to handle the case of reporting a false positive.
+    pub fn every_line_is_candidate(
+        &mut self,
+        yes: bool,
+    ) -> &mut RegexMatcher {
+        self.every_line_is_candidate = yes;
+        self
+    }
+}
+
+impl Matcher for RegexMatcher {
+    type Captures = NoCaptures;
+    type Error = NoError;
+
+    fn find_at(
+        &self,
+        haystack: &[u8],
+        at: usize,
+    ) -> Result<Option<Match>, NoError> {
+        Ok(self.regex
+            .find_at(haystack, at)
+            .map(|m| Match::new(m.start(), m.end())))
+    }
+
+    fn new_captures(&self) -> Result<NoCaptures, NoError> {
+        Ok(NoCaptures::new())
+    }
+
+    fn line_terminator(&self) -> Option<LineTerminator> {
+        self.line_term
+    }
+
+    fn find_candidate_line(
+        &self,
+        haystack: &[u8],
+    ) -> Result<Option<LineMatchKind>, NoError> {
+        if self.every_line_is_candidate {
+            assert!(self.line_term.is_some());
+            if haystack.is_empty() {
+                return Ok(None);
+            }
+            // Make it interesting and return the last byte in the current
+            // line.
+            let i = memchr(self.line_term.unwrap().as_byte(), haystack)
+                .map(|i| i)
+                .unwrap_or(haystack.len() - 1);
+            Ok(Some(LineMatchKind::Candidate(i)))
+        } else {
+            Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
+        }
+    }
+}
+
+/// An implementation of Sink that prints all available information.
+///
+/// This is useful for tests because it lets us easily confirm whether data
+/// is being passed to Sink correctly.
+#[derive(Clone, Debug)]
+pub struct KitchenSink(Vec<u8>);
+
+impl KitchenSink {
+    /// Create a new implementation of Sink that includes everything in the
+    /// kitchen.
+    pub fn new() -> KitchenSink {
+        KitchenSink(vec![])
+    }
+
+    /// Return the data written to this sink.
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl Sink for KitchenSink {
+    type Error = io::Error;
+
+    fn matched(
+        &mut self,
+        _searcher: &Searcher,
+        mat: &SinkMatch,
+    ) -> Result<bool, io::Error> {
+        assert!(!mat.bytes().is_empty());
+        assert!(mat.lines().count() >= 1);
+
+        let mut line_number = mat.line_number();
+        let mut byte_offset = mat.absolute_byte_offset();
+        for line in mat.lines() {
+            if let Some(ref mut n) = line_number {
+                write!(self.0, "{}:", n)?;
+                *n += 1;
+            }
+
+            write!(self.0, "{}:", byte_offset)?;
+            byte_offset += line.len() as u64;
+            self.0.write_all(line)?;
+        }
+        Ok(true)
+    }
+
+    fn context(
+        &mut self,
+        _searcher: &Searcher,
+        context: &SinkContext,
+    ) -> Result<bool, io::Error> {
+        assert!(!context.bytes().is_empty());
+        assert!(context.lines().count() == 1);
+
+        if let Some(line_number) = context.line_number() {
+            write!(self.0, "{}-", line_number)?;
+        }
+        write!(self.0, "{}-", context.absolute_byte_offset)?;
+        self.0.write_all(context.bytes())?;
+        Ok(true)
+    }
+
+    fn context_break(
+        &mut self,
+        _searcher: &Searcher,
+    ) -> Result<bool, io::Error> {
+        self.0.write_all(b"--\n")?;
+        Ok(true)
+    }
+
+    fn finish(
+        &mut self,
+        _searcher: &Searcher,
+        sink_finish: &SinkFinish,
+    ) -> Result<(), io::Error> {
+        writeln!(self.0, "")?;
+        writeln!(self.0, "byte count:{}", sink_finish.byte_count())?;
+        if let Some(offset) = sink_finish.binary_byte_offset() {
+            writeln!(self.0, "binary offset:{}", offset)?;
+        }
+        Ok(())
+    }
+}
+
+/// A type for expressing tests on a searcher.
+///
+/// The searcher code has a lot of different code paths, mostly for the
+/// purposes of optimizing a bunch of different use cases. The intent of the
+/// searcher is to pick the best code path based on the configuration, which
+/// means there is no obviously direct way to ask that a specific code path
+/// be exercised. Thus, the purpose of this tester is to explicitly check as
+/// many code paths that make sense.
+///
+/// The tester works by assuming you want to test all pertinent code paths.
+/// These can be trimmed down as necessary via the various builder methods.
+#[derive(Debug)]
+pub struct SearcherTester {
+    haystack: String,
+    pattern: String,
+    filter: Option<::regex::Regex>,
+    print_labels: bool,
+    expected_no_line_number: Option<String>,
+    expected_with_line_number: Option<String>,
+    expected_slice_no_line_number: Option<String>,
+    expected_slice_with_line_number: Option<String>,
+    by_line: bool,
+    multi_line: bool,
+    invert_match: bool,
+    line_number: bool,
+    binary: BinaryDetection,
+    auto_heap_limit: bool,
+    after_context: usize,
+    before_context: usize,
+    passthru: bool,
+}
+
+impl SearcherTester {
+    /// Create a new tester for testing searchers.
+    pub fn new(haystack: &str, pattern: &str) -> SearcherTester {
+        SearcherTester {
+            haystack: haystack.to_string(),
+            pattern: pattern.to_string(),
+            filter: None,
+            print_labels: false,
+            expected_no_line_number: None,
+            expected_with_line_number: None,
+            expected_slice_no_line_number: None,
+            expected_slice_with_line_number: None,
+            by_line: true,
+            multi_line: true,
+            invert_match: false,
+            line_number: true,
+            binary: BinaryDetection::none(),
+            auto_heap_limit: true,
+            after_context: 0,
+            before_context: 0,
+            passthru: false,
+        }
+    }
+
+    /// Execute the test. If the test succeeds, then this returns successfully.
+    /// If the test fails, then it panics with an informative message.
+    pub fn test(&self) {
+        // Check for configuration errors.
+ if self.expected_no_line_number.is_none() { + panic!("an 'expected' string with NO line numbers must be given"); + } + if self.line_number && self.expected_with_line_number.is_none() { + panic!("an 'expected' string with line numbers must be given, \ + or disable testing with line numbers"); + } + + let configs = self.configs(); + if configs.is_empty() { + panic!("test configuration resulted in nothing being tested"); + } + if self.print_labels { + for config in &configs { + let labels = vec![ + format!("reader-{}", config.label), + format!("slice-{}", config.label), + ]; + for label in &labels { + if self.include(label) { + println!("{}", label); + } else { + println!("{} (ignored)", label); + } + } + } + } + for config in &configs { + let label = format!("reader-{}", config.label); + if self.include(&label) { + let got = config.search_reader(&self.haystack); + assert_eq_printed!(config.expected_reader, got, "{}", label); + } + + let label = format!("slice-{}", config.label); + if self.include(&label) { + let got = config.search_slice(&self.haystack); + assert_eq_printed!(config.expected_slice, got, "{}", label); + } + } + } + + /// Set a regex pattern to filter the tests that are run. + /// + /// By default, no filter is present. When a filter is set, only test + /// configurations with a label matching the given pattern will be run. + /// + /// This is often useful when debugging tests, e.g., when you want to do + /// printf debugging and only want one particular test configuration to + /// execute. + #[allow(dead_code)] + pub fn filter(&mut self, pattern: &str) -> &mut SearcherTester { + self.filter = Some(::regex::Regex::new(pattern).unwrap()); + self + } + + /// When set, the labels for all test configurations are printed before + /// executing any test. + /// + /// Note that in order to see these in tests that aren't failing, you'll + /// want to use `cargo test -- --nocapture`. 
+ #[allow(dead_code)] + pub fn print_labels(&mut self, yes: bool) -> &mut SearcherTester { + self.print_labels = yes; + self + } + + /// Set the expected search results, without line numbers. + pub fn expected_no_line_number( + &mut self, + exp: &str, + ) -> &mut SearcherTester { + self.expected_no_line_number = Some(exp.to_string()); + self + } + + /// Set the expected search results, with line numbers. + pub fn expected_with_line_number( + &mut self, + exp: &str, + ) -> &mut SearcherTester { + self.expected_with_line_number = Some(exp.to_string()); + self + } + + /// Set the expected search results, without line numbers, when performing + /// a search on a slice. When not present, `expected_no_line_number` is + /// used instead. + pub fn expected_slice_no_line_number( + &mut self, + exp: &str, + ) -> &mut SearcherTester { + self.expected_slice_no_line_number = Some(exp.to_string()); + self + } + + /// Set the expected search results, with line numbers, when performing a + /// search on a slice. When not present, `expected_with_line_number` is + /// used instead. + #[allow(dead_code)] + pub fn expected_slice_with_line_number( + &mut self, + exp: &str, + ) -> &mut SearcherTester { + self.expected_slice_with_line_number = Some(exp.to_string()); + self + } + + /// Whether to test search with line numbers or not. + /// + /// This is enabled by default. When enabled, the string that is expected + /// when line numbers are present must be provided. Otherwise, the expected + /// string isn't required. + pub fn line_number(&mut self, yes: bool) -> &mut SearcherTester { + self.line_number = yes; + self + } + + /// Whether to test search using the line-by-line searcher or not. + /// + /// By default, this is enabled. + pub fn by_line(&mut self, yes: bool) -> &mut SearcherTester { + self.by_line = yes; + self + } + + /// Whether to test search using the multi line searcher or not. + /// + /// By default, this is enabled. 
+ #[allow(dead_code)] + pub fn multi_line(&mut self, yes: bool) -> &mut SearcherTester { + self.multi_line = yes; + self + } + + /// Whether to perform an inverted search or not. + /// + /// By default, this is disabled. + pub fn invert_match(&mut self, yes: bool) -> &mut SearcherTester { + self.invert_match = yes; + self + } + + /// Whether to enable binary detection on all searches. + /// + /// By default, this is disabled. + pub fn binary_detection( + &mut self, + detection: BinaryDetection, + ) -> &mut SearcherTester { + self.binary = detection; + self + } + + /// Whether to automatically attempt to test the heap limit setting or not. + /// + /// By default, one of the test configurations includes setting the heap + /// limit to its minimal value for normal operation, which checks that + /// everything works even at the extremes. However, in some cases, the heap + /// limit can (expectedly) alter the output slightly. For example, it can + /// impact the number of bytes searched when performing binary detection. + /// For convenience, it can be useful to disable the automatic heap limit + /// test. + pub fn auto_heap_limit(&mut self, yes: bool) -> &mut SearcherTester { + self.auto_heap_limit = yes; + self + } + + /// Set the number of lines to include in the "after" context. + /// + /// The default is `0`, which is equivalent to not printing any context. + pub fn after_context(&mut self, lines: usize) -> &mut SearcherTester { + self.after_context = lines; + self + } + + /// Set the number of lines to include in the "before" context. + /// + /// The default is `0`, which is equivalent to not printing any context. + pub fn before_context(&mut self, lines: usize) -> &mut SearcherTester { + self.before_context = lines; + self + } + + /// Whether to enable the "passthru" feature or not. + /// + /// When passthru is enabled, it effectively treats all non-matching lines + /// as contextual lines. 
In other words, enabling this is akin to
+    /// requesting an unbounded number of before and after contextual lines.
+    ///
+    /// This is disabled by default.
+    pub fn passthru(&mut self, yes: bool) -> &mut SearcherTester {
+        self.passthru = yes;
+        self
+    }
+
+    /// Return the minimum size of a buffer required for a successful search.
+    ///
+    /// Generally, this corresponds to the maximum length of a line (including
+    /// its terminator), but if context settings are enabled, then this must
+    /// include the sum of the longest N lines.
+    ///
+    /// Note that this must account for whether the test is using multi line
+    /// search or not, since multi line search requires being able to fit the
+    /// entire haystack into memory.
+    fn minimal_heap_limit(&self, multi_line: bool) -> usize {
+        if multi_line {
+            1 + self.haystack.len()
+        } else if self.before_context == 0 && self.after_context == 0 {
+            1 + self.haystack.lines().map(|s| s.len()).max().unwrap_or(0)
+        } else {
+            let mut lens: Vec<usize> =
+                self.haystack.lines().map(|s| s.len()).collect();
+            lens.sort();
+            lens.reverse();
+
+            let context_count =
+                if self.passthru {
+                    self.haystack.lines().count()
+                } else {
+                    // Why do we add 2 here? Well, we need to add 1 in order to
+                    // have room to search at least one line. We add another
+                    // because the implementation will occasionally include
+                    // an additional line when handling the context. There's
+                    // no particularly good reason, other than keeping the
+                    // implementation simple.
+                    2 + self.before_context + self.after_context
+                };
+
+            // We add 1 to each line since `str::lines` doesn't include the
+            // line terminator.
+            lens.into_iter()
+                .take(context_count)
+                .map(|len| len + 1)
+                .sum::<usize>()
+        }
+    }
+
+    /// Returns true if and only if the given label should be included as part
+    /// of executing `test`.
+    ///
+    /// Inclusion is determined by the filter specified. If no filter has been
+    /// given, then this always returns `true`.
+ fn include(&self, label: &str) -> bool { + let re = match self.filter { + None => return true, + Some(ref re) => re, + }; + re.is_match(label) + } + + /// Configs generates a set of all search configurations that should be + /// tested. The configs generated are based on the configuration in this + /// builder. + fn configs(&self) -> Vec { + let mut configs = vec![]; + + let matcher = RegexMatcher::new(&self.pattern); + let mut builder = SearcherBuilder::new(); + builder + .line_number(false) + .invert_match(self.invert_match) + .binary_detection(self.binary.clone()) + .after_context(self.after_context) + .before_context(self.before_context) + .passthru(self.passthru); + + if self.by_line { + let mut matcher = matcher.clone(); + let mut builder = builder.clone(); + + let expected_reader = + self.expected_no_line_number.as_ref().unwrap().to_string(); + let expected_slice = match self.expected_slice_no_line_number { + None => expected_reader.clone(), + Some(ref e) => e.to_string(), + }; + configs.push(TesterConfig { + label: "byline-noterm-nonumber".to_string(), + expected_reader: expected_reader.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + + if self.auto_heap_limit { + builder.heap_limit(Some(self.minimal_heap_limit(false))); + configs.push(TesterConfig { + label: "byline-noterm-nonumber-heaplimit".to_string(), + expected_reader: expected_reader.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + builder.heap_limit(None); + } + + matcher.set_line_term(Some(LineTerminator::byte(b'\n'))); + configs.push(TesterConfig { + label: "byline-term-nonumber".to_string(), + expected_reader: expected_reader.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + + matcher.every_line_is_candidate(true); + configs.push(TesterConfig { + label: "byline-term-nonumber-candidates".to_string(), + 
expected_reader: expected_reader.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + } + if self.by_line && self.line_number { + let mut matcher = matcher.clone(); + let mut builder = builder.clone(); + + let expected_reader = + self.expected_with_line_number.as_ref().unwrap().to_string(); + let expected_slice = match self.expected_slice_with_line_number { + None => expected_reader.clone(), + Some(ref e) => e.to_string(), + }; + + builder.line_number(true); + configs.push(TesterConfig { + label: "byline-noterm-number".to_string(), + expected_reader: expected_reader.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + + matcher.set_line_term(Some(LineTerminator::byte(b'\n'))); + configs.push(TesterConfig { + label: "byline-term-number".to_string(), + expected_reader: expected_reader.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + + matcher.every_line_is_candidate(true); + configs.push(TesterConfig { + label: "byline-term-number-candidates".to_string(), + expected_reader: expected_reader.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + } + if self.multi_line { + let mut builder = builder.clone(); + let expected_slice = match self.expected_slice_no_line_number { + None => { + self.expected_no_line_number.as_ref().unwrap().to_string() + } + Some(ref e) => e.to_string(), + }; + + builder.multi_line(true); + configs.push(TesterConfig { + label: "multiline-nonumber".to_string(), + expected_reader: expected_slice.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + + if self.auto_heap_limit { + builder.heap_limit(Some(self.minimal_heap_limit(true))); + configs.push(TesterConfig { + label: "multiline-nonumber-heaplimit".to_string(), + expected_reader: 
expected_slice.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + builder.heap_limit(None); + } + } + if self.multi_line && self.line_number { + let mut builder = builder.clone(); + let expected_slice = match self.expected_slice_with_line_number { + None => { + self.expected_with_line_number + .as_ref().unwrap().to_string() + } + Some(ref e) => e.to_string(), + }; + + builder.multi_line(true); + builder.line_number(true); + configs.push(TesterConfig { + label: "multiline-number".to_string(), + expected_reader: expected_slice.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + + builder.heap_limit(Some(self.minimal_heap_limit(true))); + configs.push(TesterConfig { + label: "multiline-number-heaplimit".to_string(), + expected_reader: expected_slice.clone(), + expected_slice: expected_slice.clone(), + builder: builder.clone(), + matcher: matcher.clone(), + }); + builder.heap_limit(None); + } + configs + } +} + +#[derive(Debug)] +struct TesterConfig { + label: String, + expected_reader: String, + expected_slice: String, + builder: SearcherBuilder, + matcher: RegexMatcher, +} + +impl TesterConfig { + /// Execute a search using a reader. This exercises the incremental search + /// strategy, where the entire contents of the corpus aren't necessarily + /// in memory at once. + fn search_reader(&self, haystack: &str) -> String { + let mut sink = KitchenSink::new(); + let mut searcher = self.builder.build(); + let result = searcher.search_reader( + &self.matcher, + haystack.as_bytes(), + &mut sink, + ); + if let Err(err) = result { + let label = format!("reader-{}", self.label); + panic!("error running '{}': {}", label, err); + } + String::from_utf8(sink.as_bytes().to_vec()).unwrap() + } + + /// Execute a search using a slice. This exercises the search routines that + /// have the entire contents of the corpus in memory at one time. 
+ fn search_slice(&self, haystack: &str) -> String { + let mut sink = KitchenSink::new(); + let mut searcher = self.builder.build(); + let result = searcher.search_slice( + &self.matcher, + haystack.as_bytes(), + &mut sink, + ); + if let Err(err) = result { + let label = format!("slice-{}", self.label); + panic!("error running '{}': {}", label, err); + } + String::from_utf8(sink.as_bytes().to_vec()).unwrap() + } +} + +#[cfg(test)] +mod tests { + use grep_matcher::{Match, Matcher}; + + use super::*; + + fn m(start: usize, end: usize) -> Match { + Match::new(start, end) + } + + #[test] + fn empty_line1() { + let haystack = b""; + let matcher = RegexMatcher::new(r"^$"); + + assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0)))); + } + + #[test] + fn empty_line2() { + let haystack = b"\n"; + let matcher = RegexMatcher::new(r"^$"); + + assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0)))); + assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(1, 1)))); + } + + #[test] + fn empty_line3() { + let haystack = b"\n\n"; + let matcher = RegexMatcher::new(r"^$"); + + assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(0, 0)))); + assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(1, 1)))); + assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); + } + + #[test] + fn empty_line4() { + let haystack = b"a\n\nb\n"; + let matcher = RegexMatcher::new(r"^$"); + + assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2)))); + assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2)))); + assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); + assert_eq!(matcher.find_at(haystack, 3), Ok(Some(m(5, 5)))); + assert_eq!(matcher.find_at(haystack, 4), Ok(Some(m(5, 5)))); + assert_eq!(matcher.find_at(haystack, 5), Ok(Some(m(5, 5)))); + } + + #[test] + fn empty_line5() { + let haystack = b"a\n\nb\nc"; + let matcher = RegexMatcher::new(r"^$"); + + assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2)))); + assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2)))); + 
assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); + assert_eq!(matcher.find_at(haystack, 3), Ok(None)); + assert_eq!(matcher.find_at(haystack, 4), Ok(None)); + assert_eq!(matcher.find_at(haystack, 5), Ok(None)); + assert_eq!(matcher.find_at(haystack, 6), Ok(None)); + } + + #[test] + fn empty_line6() { + let haystack = b"a\n"; + let matcher = RegexMatcher::new(r"^$"); + + assert_eq!(matcher.find_at(haystack, 0), Ok(Some(m(2, 2)))); + assert_eq!(matcher.find_at(haystack, 1), Ok(Some(m(2, 2)))); + assert_eq!(matcher.find_at(haystack, 2), Ok(Some(m(2, 2)))); + } +} diff --git a/grep2/COPYING b/grep2/COPYING new file mode 100644 index 000000000..bb9c20a09 --- /dev/null +++ b/grep2/COPYING @@ -0,0 +1,3 @@ +This project is dual-licensed under the Unlicense and MIT licenses. + +You may use this code under the terms of either license. diff --git a/grep2/Cargo.toml b/grep2/Cargo.toml new file mode 100644 index 000000000..caaf7a9cb --- /dev/null +++ b/grep2/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "grep2" +version = "0.2.0" #:version +authors = ["Andrew Gallant "] +description = """ +Fast line oriented regex searching as a library. 
+""" +documentation = "http://burntsushi.net/rustdoc/grep/" +homepage = "https://github.com/BurntSushi/ripgrep" +repository = "https://github.com/BurntSushi/ripgrep" +readme = "README.md" +keywords = ["regex", "grep", "egrep", "search", "pattern"] +license = "Unlicense/MIT" + +[dependencies] +grep-matcher = { version = "0.0.1", path = "../grep-matcher" } +grep-printer = { version = "0.0.1", path = "../grep-printer" } +grep-regex = { version = "0.0.1", path = "../grep-regex" } +grep-searcher = { version = "0.0.1", path = "../grep-searcher" } + +[features] +avx-accel = ["grep-searcher/avx-accel"] +simd-accel = ["grep-searcher/simd-accel"] diff --git a/grep2/LICENSE-MIT b/grep2/LICENSE-MIT new file mode 100644 index 000000000..3b0a5dc09 --- /dev/null +++ b/grep2/LICENSE-MIT @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2015 Andrew Gallant + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/grep2/README.md b/grep2/README.md
new file mode 100644
index 000000000..86cc8c2c7
--- /dev/null
+++ b/grep2/README.md
@@ -0,0 +1,4 @@
+grep
+----
+This is a *library* that provides grep-style line-by-line regex searching (with
+comparable performance to `grep` itself).
diff --git a/grep2/UNLICENSE b/grep2/UNLICENSE
new file mode 100644
index 000000000..68a49daad
--- /dev/null
+++ b/grep2/UNLICENSE
@@ -0,0 +1,24 @@
+This is free and unencumbered software released into the public domain.
+
+Anyone is free to copy, modify, publish, use, compile, sell, or
+distribute this software, either in source code form or as a compiled
+binary, for any purpose, commercial or non-commercial, and by any
+means.
+
+In jurisdictions that recognize copyright laws, the author or authors
+of this software dedicate any and all copyright interest in the
+software to the public domain. We make this dedication for the benefit
+of the public at large and to the detriment of our heirs and
+successors. We intend this dedication to be an overt act of
+relinquishment in perpetuity of all present and future rights to this
+software under copyright law.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.
+
+For more information, please refer to <http://unlicense.org/>
diff --git a/grep2/src/lib.rs b/grep2/src/lib.rs
new file mode 100644
index 000000000..b6e026846
--- /dev/null
+++ b/grep2/src/lib.rs
@@ -0,0 +1,10 @@
+/*!
+TODO.
+*/ + +#![deny(missing_docs)] + +pub extern crate grep_matcher as matcher; +pub extern crate grep_printer as printer; +pub extern crate grep_regex as regex; +pub extern crate grep_searcher as searcher;