Skip to content

Commit

Permalink
Cleanups (#14)
Browse files Browse the repository at this point in the history
Docs & remove some redundancies
  • Loading branch information
clipperhouse committed Jun 5, 2022
1 parent dd355a5 commit 2b27b05
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 22 deletions.
2 changes: 1 addition & 1 deletion graphemes/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"github.com/clipperhouse/uax29/iterators"
)

// NewScanner Scanner, to tokenize graphemes per https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
// NewScanner returns a Scanner, to tokenize graphemes per https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries.
// Iterate through graphemes by calling Scan() until false, then check Err(). See also the bufio.Scanner docs.
func NewScanner(r io.Reader) *iterators.Scanner {
scanner := iterators.NewScanner(r, SplitFunc)
Expand Down
23 changes: 8 additions & 15 deletions iterators/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
"golang.org/x/text/transform"
)

type s = bufio.Scanner
type s = *bufio.Scanner

type Scanner struct {
s
Expand All @@ -28,7 +28,7 @@ type Scanner struct {
// iterate while Scan() is true. See also the bufio.Scanner docs.
func NewScanner(r io.Reader, split bufio.SplitFunc) *Scanner {
sc := &Scanner{
s: *bufio.NewScanner(r),
s: bufio.NewScanner(r),
}
sc.s.Split(split)

Expand Down Expand Up @@ -74,7 +74,7 @@ func (sc *Scanner) Transform(transformers ...transform.Transformer) {

// Gotta swap out the underlying bufio.Scanner. A little risky.
// See Scanner.scanCalled and Scanner.err for how we prevent misuse.
sc.s = *bufio.NewScanner(r)
sc.s = bufio.NewScanner(r)
sc.s.Split(sc.split)
}

Expand All @@ -88,23 +88,16 @@ func (sc *Scanner) Scan() bool {

sc.scanCalled = true

scan := true

outer:
for scan {
scan = sc.s.Scan()
if !scan {
break
}

scan:
for sc.s.Scan() {
for _, f := range sc.predicates {
if !f(sc.Bytes()) {
continue outer
continue scan
}
}

return scan
return true
}

return scan
return false
}
25 changes: 21 additions & 4 deletions iterators/segmenter.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package iterators

import (
"bufio"
"errors"

"github.com/clipperhouse/uax29/iterators/filter"
"golang.org/x/text/transform"
Expand Down Expand Up @@ -51,6 +52,9 @@ func (seg *Segmenter) Transform(transformers ...transform.Transformer) {
seg.transforms = transform.Chain(transformers...)
}

// ErrAdvanceNegative is the error Segmenter.Next records (and then returns
// false) when the SplitFunc returns a negative advance.
var ErrAdvanceNegative = errors.New("SplitFunc returned a negative advance")
// ErrAdvanceTooFar is the error Segmenter.Next records (and then returns
// false) when the Segmenter's position is past the end of its data.
var ErrAdvanceTooFar = errors.New("SplitFunc advanced beyond the end of the data")

// Next advances Segmenter to the next token (segment). It returns false when there
// are no remaining segments, or an error occurred.
func (seg *Segmenter) Next() bool {
Expand All @@ -61,18 +65,31 @@ outer:
seg.token = token
seg.err = err

if advance == 0 {
if seg.err != nil {
return false
}
if len(seg.token) == 0 {

// Guardrails
if advance < 0 {
seg.err = ErrAdvanceNegative
return false
}
if seg.err != nil {
if seg.pos > len(seg.data) {
seg.err = ErrAdvanceTooFar
return false
}

// Interpret as EOF
if advance == 0 {
return false
}

// Interpret as EOF
if len(seg.token) == 0 {
return false
}

if seg.transforms != nil {
seg.transforms.Reset() // recommended
seg.token, _, err = transform.Bytes(seg.transforms, seg.token)
if err != nil {
seg.err = err
Expand Down
2 changes: 1 addition & 1 deletion sentences/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"github.com/clipperhouse/uax29/iterators"
)

// NewScanner Scanner, to tokenize sentences per https://unicode.org/reports/tr29/#Sentence_Boundaries.
// NewScanner returns a Scanner, to tokenize sentences per https://unicode.org/reports/tr29/#Sentence_Boundaries.
// Iterate through sentences by calling Scan() until false, then check Err(). See also the bufio.Scanner docs.
func NewScanner(r io.Reader) *iterators.Scanner {
sc := iterators.NewScanner(r, SplitFunc)
Expand Down
2 changes: 1 addition & 1 deletion words/scanner.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import (
"github.com/clipperhouse/uax29/iterators"
)

// NewScanner Scanner, to tokenize words per https://unicode.org/reports/tr29/#Word_Boundaries.
// NewScanner returns a Scanner, to tokenize words per https://unicode.org/reports/tr29/#Word_Boundaries.
// Iterate through words by calling Scan() until false, then check Err(). See also the bufio.Scanner docs.
func NewScanner(r io.Reader) *iterators.Scanner {
sc := iterators.NewScanner(r, SplitFunc)
Expand Down

0 comments on commit 2b27b05

Please sign in to comment.