Skip to content

Commit

Permalink
Filters (#12)
Browse files Browse the repository at this point in the history
Implement `Filter()` on `Segmenter` and `Scanner` and `SegmentAll`. See `Wordlike` for an example.
  • Loading branch information
clipperhouse committed May 26, 2022
1 parent 556475b commit 619311b
Show file tree
Hide file tree
Showing 21 changed files with 764 additions and 214 deletions.
11 changes: 0 additions & 11 deletions .editorconfig

This file was deleted.

2 changes: 1 addition & 1 deletion doc.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Package uax29 provides Unicode text segmentation (UAX #29) for words, sentences and graphemes.
//
// See the words, sentences, and graphemes packages for implementations.
// See the words, sentences, and graphemes packages for details and usage.
//
// For more information on the UAX #29 spec: https://unicode.org/reports/tr29/
package uax29
10 changes: 5 additions & 5 deletions graphemes/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
package graphemes

import (
"bufio"
"io"

"github.com/clipperhouse/uax29/iterators"
)

// NewScanner returns a bufio.Scanner, to tokenize graphemes per https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
// Iterate through graphemes by calling Scan() until false. See the bufio.Scanner docs for details.
func NewScanner(r io.Reader) *bufio.Scanner {
scanner := bufio.NewScanner(r)
scanner.Split(SplitFunc)
// Iterate through graphemes by calling Scan() until false. See also the bufio.Scanner docs.
func NewScanner(r io.Reader) *iterators.Scanner {
scanner := iterators.NewScanner(r, SplitFunc)
return scanner
}
13 changes: 8 additions & 5 deletions graphemes/segmenter.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package graphemes

import "github.com/clipperhouse/uax29/segmenter"
import (
"github.com/clipperhouse/uax29/iterators"
"github.com/clipperhouse/uax29/iterators/filter"
)

// NewSegmenter returns a Segmenter, which is an iterator over the source text.
// Iterate while Next() is true, and access the segmented graphemes via Bytes().
func NewSegmenter(data []byte) *segmenter.Segmenter {
seg := segmenter.New(SplitFunc)
func NewSegmenter(data []byte) *iterators.Segmenter {
seg := iterators.NewSegmenter(SplitFunc)
seg.SetText(data)
return seg
}
Expand All @@ -15,11 +18,11 @@ func NewSegmenter(data []byte) *segmenter.Segmenter {
// this will save you some code. The downside is that this allocation is
// unbounded -- O(n) on the number of tokens. Use Segmenter for more bounded
// memory usage.
func SegmentAll(data []byte) [][]byte {
func SegmentAll(data []byte, predicate ...filter.Predicate) [][]byte {
// Optimization: guesstimate that the average grapheme is 1 byte,
// allocate a large enough array to avoid resizing
result := make([][]byte, 0, len(data))

_ = segmenter.All(data, &result, SplitFunc) // can elide the error, see tests
_ = iterators.All(data, &result, SplitFunc, predicate...) // can elide the error, see tests
return result
}
57 changes: 57 additions & 0 deletions iterators/filter/filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package filter

import (
"unicode"
"unicode/utf8"

"github.com/clipperhouse/uax29/iterators/util"
)

type Predicate func([]byte) bool

// Contains creates a Predicate reporting true when at least one rune of
// the candidate token falls within any of the given Unicode ranges.
// Ranges are tables such as unicode.Letter, unicode.Arabic, or
// unicode.Lower, allowing tests for a wide variety of character or
// script types.
//
// Intended for passing to segmenter.Filter or scanner.Filter.
//
// An empty token, or an empty set of ranges, yields false.
func Contains(ranges ...*unicode.RangeTable) Predicate {
	return func(segment []byte) bool {
		return util.Contains(segment, ranges...)
	}
}

// Entirely creates a Predicate reporting true only when every rune of
// the candidate token falls within one or more of the given Unicode
// ranges. Ranges are tables such as unicode.Letter, unicode.Arabic, or
// unicode.Lower, allowing tests for a wide variety of character or
// script types.
//
// Intended for passing to segmenter.Filter or scanner.Filter.
//
// An empty token, or an empty set of ranges, yields false.
func Entirely(ranges ...*unicode.RangeTable) Predicate {
	return func(segment []byte) bool {
		return util.Entirely(segment, ranges...)
	}
}

// Wordlike is a filter which returns only tokens (segments) that are
// "words" in the common sense, excluding tokens that are whitespace or
// punctuation. It includes any token that contains a Letter, Number, or
// Symbol, as defined by Unicode. To use it, call Filter(Wordlike) on a
// Segmenter or Scanner.
var Wordlike Predicate = func(token []byte) bool {
	// Hot path: decode runes directly, which is faster than Contains
	// with range tables.
	for i := 0; i < len(token); {
		r, size := utf8.DecodeRune(token[i:])
		if unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsSymbol(r) {
			return true
		}
		i += size
	}
	return false
}
59 changes: 59 additions & 0 deletions iterators/filter/filter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package filter_test

import (
"testing"
"unicode"

"github.com/clipperhouse/uax29/iterators/filter"
)

// TestContains exercises filter.Contains against inputs that do and do
// not contain runes from the given ranges.
func TestContains(t *testing.T) {
	type test struct {
		input    string
		expected bool
	}

	tests := []test{
		{"", false},
		{"👍🐶", false},
		{"Hello", true},
		{"Hello, 世界.", true},
		{"世界", true},
	}

	f := filter.Contains(unicode.Latin, unicode.Ideographic)

	for _, test := range tests {
		got := f([]byte(test.input))

		if got != test.expected {
			// Report the failing input and both values; the previous
			// t.Error(test.expected) gave no clue which case failed.
			t.Errorf("Contains(%q): expected %v, got %v", test.input, test.expected, got)
		}
	}
}

// TestEntirely exercises filter.Entirely against inputs that are, and
// are not, composed entirely of runes from the given ranges.
func TestEntirely(t *testing.T) {
	type test struct {
		input    string
		expected bool
	}

	tests := []test{
		{"", false},
		{"👍🐶", false},
		{"Hello", true},
		{"Hello世界", true},
		{"Hello ", false},
		{"Hello,世界", false},
	}

	f := filter.Entirely(unicode.Latin, unicode.Ideographic)

	for _, test := range tests {
		got := f([]byte(test.input))

		if got != test.expected {
			// Report the failing input and both values; the previous
			// t.Error(test.expected) gave no clue which case failed.
			t.Errorf("Entirely(%q): expected %v, got %v", test.input, test.expected, got)
		}
	}
}
53 changes: 53 additions & 0 deletions iterators/scanner.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package iterators

import (
"bufio"
"io"

"github.com/clipperhouse/uax29/iterators/filter"
)

// s aliases bufio.Scanner; the lowercase alias keeps the embedded field
// of Scanner unexported while still promoting bufio.Scanner's methods.
type s = bufio.Scanner

// Scanner embeds a bufio.Scanner and optionally applies filters
// (predicates) to the tokens it returns; see Filter and Scan.
type Scanner struct {
	s
	predicates []filter.Predicate
}

// NewScanner creates a new Scanner given an io.Reader and bufio.SplitFunc.
// To use the new scanner, iterate while Scan() is true. See also the
// bufio.Scanner docs.
func NewScanner(r io.Reader, split bufio.SplitFunc) *Scanner {
	inner := bufio.NewScanner(r)
	inner.Split(split)
	return &Scanner{s: *inner}
}

// Filter applies one or more filters (predicates) to all tokens (segments), only returning those
// where all predicates evaluate true.
//
// Calling Filter again replaces any previously set predicates.
func (sc *Scanner) Filter(predicates ...filter.Predicate) {
	sc.predicates = predicates
}

// Scan advances to the next token that satisfies every installed
// predicate, returning true on success. It returns false when the
// underlying bufio.Scanner is exhausted or has errored.
func (sc *Scanner) Scan() bool {
	for sc.s.Scan() {
		token := sc.Bytes()

		passed := true
		for _, p := range sc.predicates {
			if !p(token) {
				passed = false
				break
			}
		}

		if passed {
			return true
		}
	}
	return false
}
73 changes: 73 additions & 0 deletions iterators/scanner_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package iterators_test

import (
"bufio"
"bytes"
"strings"
"testing"

"github.com/clipperhouse/uax29/iterators"
"github.com/clipperhouse/uax29/words"
)

// TestScannerSameAsBufio verifies that an unfiltered iterators.Scanner
// produces exactly the same tokens as a plain bufio.Scanner using the
// same split function.
func TestScannerSameAsBufio(t *testing.T) {
	splits := []bufio.SplitFunc{words.SplitFunc, bufio.ScanWords}
	for _, split := range splits {
		for i := 0; i < 100; i++ {
			text := getRandomBytes()

			r1 := bytes.NewReader(text)
			sc1 := iterators.NewScanner(r1, split)
			r2 := bytes.NewReader(text)
			sc2 := bufio.NewScanner(r2)
			sc2.Split(split)

			for {
				ok1 := sc1.Scan()
				ok2 := sc2.Scan()
				// Both scanners must stop at the same time; the previous
				// loop condition ended silently when either stopped,
				// which would miss a missing or extra trailing token.
				if ok1 != ok2 {
					t.Fatal("Scanner and bufio.Scanner should give identical results")
				}
				if !ok1 {
					break
				}
				if !bytes.Equal(sc1.Bytes(), sc2.Bytes()) {
					t.Fatal("Scanner and bufio.Scanner should give identical results")
				}
			}

			if err := sc1.Err(); err != nil {
				t.Fatal(err)
			}
			if err := sc2.Err(); err != nil {
				t.Fatal(err)
			}
		}
	}
}

// TestScannerFilterIsApplied verifies that Scanner.Filter is honored by
// Scan, both for a single predicate and for several at once.
func TestScannerFilterIsApplied(t *testing.T) {
	text := "Hello, 世界, how are you? Nice dog aha! 👍🐶"

	{
		// single predicate
		sc := iterators.NewScanner(strings.NewReader(text), bufio.ScanWords)
		sc.Filter(startsWithH)

		found := 0
		for sc.Scan() {
			token := sc.Bytes()
			if !startsWithH(token) {
				t.Fatal("filter was not applied")
			}
			found++
		}

		if found != 2 {
			t.Fatalf("scanner filter should have found 2 results, got %d", found)
		}
	}

	{
		// multiple predicates, passed variadically
		sc := iterators.NewScanner(strings.NewReader(text), bufio.ScanWords)
		sc.Filter(startsWithH, endsWithW)

		found := 0
		for sc.Scan() {
			token := sc.Bytes()
			if !startsWithH(token) || !endsWithW(token) {
				t.Fatal("variadic scanner filter was not applied")
			}
			found++
		}

		if found != 1 {
			t.Fatalf("variadic scanner filter should have found 1 result, got %d", found)
		}
	}
}

0 comments on commit 619311b

Please sign in to comment.