Skip to content

Commit

Permalink
Filters (#12)
Browse files Browse the repository at this point in the history
Implement `Filter()` on `Segmenter` and `Scanner` and `SegmentAll`. See `Wordlike` for an example.
  • Loading branch information
clipperhouse committed May 26, 2022
1 parent 556475b commit 619311b
Show file tree
Hide file tree
Showing 21 changed files with 764 additions and 214 deletions.
11 changes: 0 additions & 11 deletions .editorconfig

This file was deleted.

2 changes: 1 addition & 1 deletion doc.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Package uax29 provides Unicode text segmentation (UAX #29) for words, sentences and graphemes.
//
// See the words, sentences, and graphemes packages for implementations.
// See the words, sentences, and graphemes packages for details and usage.
//
// For more information on the UAX #29 spec: https://unicode.org/reports/tr29/
package uax29
10 changes: 5 additions & 5 deletions graphemes/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
package graphemes

import (
"bufio"
"io"

"github.com/clipperhouse/uax29/iterators"
)

// NewScanner returns a bufio.Scanner, to tokenize graphemes per https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
// Iterate through graphemes by calling Scan() until false. See the bufio.Scanner docs for details.
func NewScanner(r io.Reader) *bufio.Scanner {
scanner := bufio.NewScanner(r)
scanner.Split(SplitFunc)
// Iterate through graphemes by calling Scan() until false. See also the bufio.Scanner docs.
func NewScanner(r io.Reader) *iterators.Scanner {
scanner := iterators.NewScanner(r, SplitFunc)
return scanner
}
13 changes: 8 additions & 5 deletions graphemes/segmenter.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
package graphemes

import "github.com/clipperhouse/uax29/segmenter"
import (
"github.com/clipperhouse/uax29/iterators"
"github.com/clipperhouse/uax29/iterators/filter"
)

// NewSegmenter returns a Segmenter, which is an iterator over the source text.
// Iterate while Next() is true, and access the segmented graphemes via Bytes().
func NewSegmenter(data []byte) *segmenter.Segmenter {
seg := segmenter.New(SplitFunc)
func NewSegmenter(data []byte) *iterators.Segmenter {
seg := iterators.NewSegmenter(SplitFunc)
seg.SetText(data)
return seg
}
Expand All @@ -15,11 +18,11 @@ func NewSegmenter(data []byte) *segmenter.Segmenter {
// this will save you some code. The downside is that this allocation is
// unbounded -- O(n) on the number of tokens. Use Segmenter for more bounded
// memory usage.
func SegmentAll(data []byte) [][]byte {
func SegmentAll(data []byte, predicate ...filter.Predicate) [][]byte {
// Optimization: guesstimate that the average grapheme is 1 byte,
// allocate a large enough array to avoid resizing
result := make([][]byte, 0, len(data))

_ = segmenter.All(data, &result, SplitFunc) // can elide the error, see tests
_ = iterators.All(data, &result, SplitFunc, predicate...) // can elide the error, see tests
return result
}
57 changes: 57 additions & 0 deletions iterators/filter/filter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
package filter

import (
"unicode"
"unicode/utf8"

"github.com/clipperhouse/uax29/iterators/util"
)

type Predicate func([]byte) bool

// Contains creates a Predicate reporting true when at least one rune of
// the candidate token falls within any of the given Unicode ranges.
// Ranges are tables such as unicode.Letter, unicode.Arabic, or
// unicode.Lower, allowing tests for a wide variety of character or
// script types.
//
// Intended for passing to segmenter.Filter or scanner.Filter.
//
// An empty token, or an empty set of ranges, yields false.
func Contains(ranges ...*unicode.RangeTable) Predicate {
	return func(segment []byte) bool {
		return util.Contains(segment, ranges...)
	}
}

// Entirely creates a Predicate reporting true only when every rune of
// the candidate token falls within one or more of the given Unicode
// ranges. Ranges are tables such as unicode.Letter, unicode.Arabic, or
// unicode.Lower, allowing tests for a wide variety of character or
// script types.
//
// Intended for passing to segmenter.Filter or scanner.Filter.
//
// An empty token, or an empty set of ranges, yields false.
func Entirely(ranges ...*unicode.RangeTable) Predicate {
	return func(segment []byte) bool {
		return util.Entirely(segment, ranges...)
	}
}

// Wordlike is a filter which returns only tokens (segments) that are
// "words" in the common sense, excluding tokens that are whitespace or
// punctuation. It includes any token that contains a Letter, Number, or
// Symbol, as defined by Unicode. To use it, call Filter(Wordlike) on a
// Segmenter or Scanner.
var Wordlike Predicate = func(token []byte) bool {
	// Hot path: decode runes directly, which is faster than Contains
	// with range tables.
	for i := 0; i < len(token); {
		r, size := utf8.DecodeRune(token[i:])
		if unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsSymbol(r) {
			return true
		}
		i += size
	}
	return false
}
59 changes: 59 additions & 0 deletions iterators/filter/filter_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
package filter_test

import (
"testing"
"unicode"

"github.com/clipperhouse/uax29/iterators/filter"
)

// TestContains exercises filter.Contains against inputs that do and do
// not contain runes from the given ranges.
func TestContains(t *testing.T) {
	type test struct {
		input    string
		expected bool
	}

	tests := []test{
		{"", false},
		{"👍🐶", false},
		{"Hello", true},
		{"Hello, 世界.", true},
		{"世界", true},
	}

	f := filter.Contains(unicode.Latin, unicode.Ideographic)

	for _, test := range tests {
		got := f([]byte(test.input))

		if got != test.expected {
			// Report the failing input and both values; the previous
			// t.Error(test.expected) gave no clue which case failed.
			t.Errorf("Contains(%q): expected %v, got %v", test.input, test.expected, got)
		}
	}
}

// TestEntirely exercises filter.Entirely against inputs that are, and
// are not, composed entirely of runes from the given ranges.
func TestEntirely(t *testing.T) {
	type test struct {
		input    string
		expected bool
	}

	tests := []test{
		{"", false},
		{"👍🐶", false},
		{"Hello", true},
		{"Hello世界", true},
		{"Hello ", false},
		{"Hello,世界", false},
	}

	f := filter.Entirely(unicode.Latin, unicode.Ideographic)

	for _, test := range tests {
		got := f([]byte(test.input))

		if got != test.expected {
			// Report the failing input and both values; the previous
			// t.Error(test.expected) gave no clue which case failed.
			t.Errorf("Entirely(%q): expected %v, got %v", test.input, test.expected, got)
		}
	}
}
53 changes: 53 additions & 0 deletions iterators/scanner.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
package iterators

import (
"bufio"
"io"

"github.com/clipperhouse/uax29/iterators/filter"
)

// s aliases bufio.Scanner; the lowercase alias keeps the embedded field
// of Scanner unexported while still promoting bufio.Scanner's methods.
type s = bufio.Scanner

// Scanner embeds a bufio.Scanner and optionally applies filters
// (predicates) to the tokens it returns; see Filter and Scan.
type Scanner struct {
	s
	predicates []filter.Predicate
}

// NewScanner creates a new Scanner given an io.Reader and bufio.SplitFunc.
// To use the new scanner, iterate while Scan() is true. See also the
// bufio.Scanner docs.
func NewScanner(r io.Reader, split bufio.SplitFunc) *Scanner {
	inner := bufio.NewScanner(r)
	inner.Split(split)
	return &Scanner{s: *inner}
}

// Filter applies one or more filters (predicates) to all tokens (segments), only returning those
// where all predicates evaluate true.
//
// Calling Filter again replaces any previously set predicates.
func (sc *Scanner) Filter(predicates ...filter.Predicate) {
	sc.predicates = predicates
}

// Scan advances to the next token that satisfies every installed
// predicate, returning true on success. It returns false when the
// underlying bufio.Scanner is exhausted or has errored.
func (sc *Scanner) Scan() bool {
	for sc.s.Scan() {
		token := sc.Bytes()

		passed := true
		for _, p := range sc.predicates {
			if !p(token) {
				passed = false
				break
			}
		}

		if passed {
			return true
		}
	}
	return false
}
73 changes: 73 additions & 0 deletions iterators/scanner_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package iterators_test

import (
"bufio"
"bytes"
"strings"
"testing"

"github.com/clipperhouse/uax29/iterators"
"github.com/clipperhouse/uax29/words"
)

// TestScannerSameAsBufio verifies that an unfiltered iterators.Scanner
// produces exactly the same tokens as a plain bufio.Scanner using the
// same split function.
func TestScannerSameAsBufio(t *testing.T) {
	splits := []bufio.SplitFunc{words.SplitFunc, bufio.ScanWords}
	for _, split := range splits {
		for i := 0; i < 100; i++ {
			text := getRandomBytes()

			r1 := bytes.NewReader(text)
			sc1 := iterators.NewScanner(r1, split)
			r2 := bytes.NewReader(text)
			sc2 := bufio.NewScanner(r2)
			sc2.Split(split)

			for {
				ok1 := sc1.Scan()
				ok2 := sc2.Scan()
				// Both scanners must stop at the same time; the previous
				// loop condition ended silently when either stopped,
				// which would miss a missing or extra trailing token.
				if ok1 != ok2 {
					t.Fatal("Scanner and bufio.Scanner should give identical results")
				}
				if !ok1 {
					break
				}
				if !bytes.Equal(sc1.Bytes(), sc2.Bytes()) {
					t.Fatal("Scanner and bufio.Scanner should give identical results")
				}
			}

			if err := sc1.Err(); err != nil {
				t.Fatal(err)
			}
			if err := sc2.Err(); err != nil {
				t.Fatal(err)
			}
		}
	}
}

// TestScannerFilterIsApplied verifies that Scanner.Filter is honored by
// Scan, both for a single predicate and for several at once.
func TestScannerFilterIsApplied(t *testing.T) {
	text := "Hello, 世界, how are you? Nice dog aha! 👍🐶"

	{
		// single predicate
		sc := iterators.NewScanner(strings.NewReader(text), bufio.ScanWords)
		sc.Filter(startsWithH)

		found := 0
		for sc.Scan() {
			token := sc.Bytes()
			if !startsWithH(token) {
				t.Fatal("filter was not applied")
			}
			found++
		}

		if found != 2 {
			t.Fatalf("scanner filter should have found 2 results, got %d", found)
		}
	}

	{
		// multiple predicates, passed variadically
		sc := iterators.NewScanner(strings.NewReader(text), bufio.ScanWords)
		sc.Filter(startsWithH, endsWithW)

		found := 0
		for sc.Scan() {
			token := sc.Bytes()
			if !startsWithH(token) || !endsWithW(token) {
				t.Fatal("variadic scanner filter was not applied")
			}
			found++
		}

		if found != 1 {
			t.Fatalf("variadic scanner filter should have found 1 result, got %d", found)
		}
	}
}

0 comments on commit 619311b

Please sign in to comment.