Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

lib/fs: Add ASCII fastpath for normalization #9365

Draft
wants to merge 18 commits into
base: main
Choose a base branch
from
74 changes: 54 additions & 20 deletions lib/fs/folding.go
Expand Up @@ -9,41 +9,75 @@ package fs
import (
"strings"
"unicode"
"unicode/utf8"

"golang.org/x/text/unicode/norm"
)

// UnicodeLowercaseNormalized returns the Unicode lower case variant of s,
// having also normalized it to normalization form C.
func UnicodeLowercaseNormalized(s string) string {
i := firstCaseChange(s)
if i == -1 {
return norm.NFC.String(s)
if isASCII, isLower := isASCII(s); isASCII {
if isLower {
return s
}
return toLowerASCII(s)
}

var rs strings.Builder
// WriteRune always reserves utf8.UTFMax bytes for non-ASCII runes,
// even if it doesn't need all that space. Overallocate now to prevent
// it from ever triggering a reallocation.
rs.Grow(utf8.UTFMax - 1 + len(s))
rs.WriteString(s[:i])
return toLowerUnicode(s)
}

for _, r := range s[i:] {
rs.WriteRune(unicode.ToLower(unicode.ToUpper(r)))
func isASCII(s string) (bool, bool) {
isLower := true
for i := 0; i < len(s); i++ {
c := s[i]
if c > unicode.MaxASCII {
return false, isLower
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given unicode_lowercase is the slow bench result with the new implementatin, maybe it would be worth testing keeping that behaviour from the old code. I.e. when encountering non-ascii, keep checking if there's any upper-case chars (and then exit). So you can then take the lower-case-unicode shortcut from before. This will make the unicode-mixed-case slower, but I wouldn't be surprised if not by a lot compared to the gains for unicode-lower.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's the point I don't really understand. golangs strings.Map() has that shortcut built into it:

https://cs.opensource.google/go/go/+/refs/tags/go1.21.6:src/strings/strings.go;l=489-517

My previous (flawed?) mixedcase unicode benchmark case should have stopped at the very first byte in the ASCII detection. The rest of the code looks pretty much the same to me? Map the runes and if a case change is detected, assign a string builder with the unchanged part and write mapped runes after that.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I completely glossed over the usage of strings.Map xD

That's still one or two more loops. If you did the unicode lower detection in our initial loop that does ascii detection, and then not use strings.Map, we get rid of part of a loop in both the unicode cases.
Now that also makes me wonder if we could wrap isASCII and toLowerAscii into a mapping function for strings.Map for less code branches/complexity, and equal, maybe even slightly better performance :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also we are now mixing two classes of optimisations: Improving things about the unicode path, and adding an ascii shortcut. Maybe worth doing that separately for optimal results. Just an idle, optional suggestion though.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ASCII path iterates over single bytes which is fast but doesn't work as soon as any unicode characters are involved. Iterating over runes is a tad slower but necessary in that case.

So you either end up with two loops or one rune based loop which is slower for ASCII inputs.

I also tried to get rid of lower(upper(r)) as far as possible as there are only 23(!) characters in unicode that satisfy lower(r) != lower(upper(r))

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found it, but apparently the type isn't exported by the unicode package which renders the SpecialCase unusable for us.

Copy link
Contributor Author

@bt90 bt90 Jan 29, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naive code generation attempt:

import (
	"fmt"
	"unicode"
)

func main() {
	fmt.Println("unicode.SpecialCase{")
	for r := rune(0); r <= unicode.MaxRune; r++ {
		lower := unicode.ToLower(r)
		lowerUpper := unicode.ToLower(unicode.ToUpper(r))
		if lower == lowerUpper {
			continue
		}
		hr := fmt.Sprintf("0x%04x", r)
		lr := fmt.Sprintf("0x%04x", lowerUpper)
		fmt.Printf("\tunicode.CaseRange{%s, %s, d{0, %s - %s, 0}}, // %s\n", hr, hr, lr, hr, string(r))
	}
	fmt.Println("}")
}
unicode.SpecialCase{
	unicode.CaseRange{0x00b5, 0x00b5, d{0, 0x03bc - 0x00b5, 0}}, // µ
	unicode.CaseRange{0x0131, 0x0131, d{0, 0x0069 - 0x0131, 0}}, // ı
	unicode.CaseRange{0x017f, 0x017f, d{0, 0x0073 - 0x017f, 0}}, // ſ
	unicode.CaseRange{0x0345, 0x0345, d{0, 0x03b9 - 0x0345, 0}}, // ͅ
	unicode.CaseRange{0x03c2, 0x03c2, d{0, 0x03c3 - 0x03c2, 0}}, // ς
	unicode.CaseRange{0x03d0, 0x03d0, d{0, 0x03b2 - 0x03d0, 0}}, // ϐ
	unicode.CaseRange{0x03d1, 0x03d1, d{0, 0x03b8 - 0x03d1, 0}}, // ϑ
	unicode.CaseRange{0x03d5, 0x03d5, d{0, 0x03c6 - 0x03d5, 0}}, // ϕ
	unicode.CaseRange{0x03d6, 0x03d6, d{0, 0x03c0 - 0x03d6, 0}}, // ϖ
	unicode.CaseRange{0x03f0, 0x03f0, d{0, 0x03ba - 0x03f0, 0}}, // ϰ
	unicode.CaseRange{0x03f1, 0x03f1, d{0, 0x03c1 - 0x03f1, 0}}, // ϱ
	unicode.CaseRange{0x03f5, 0x03f5, d{0, 0x03b5 - 0x03f5, 0}}, // ϵ
	unicode.CaseRange{0x1c80, 0x1c80, d{0, 0x0432 - 0x1c80, 0}}, // ᲀ
	unicode.CaseRange{0x1c81, 0x1c81, d{0, 0x0434 - 0x1c81, 0}}, // ᲁ
	unicode.CaseRange{0x1c82, 0x1c82, d{0, 0x043e - 0x1c82, 0}}, // ᲂ
	unicode.CaseRange{0x1c83, 0x1c83, d{0, 0x0441 - 0x1c83, 0}}, // ᲃ
	unicode.CaseRange{0x1c84, 0x1c84, d{0, 0x0442 - 0x1c84, 0}}, // ᲄ
	unicode.CaseRange{0x1c85, 0x1c85, d{0, 0x0442 - 0x1c85, 0}}, // ᲅ
	unicode.CaseRange{0x1c86, 0x1c86, d{0, 0x044a - 0x1c86, 0}}, // ᲆ
	unicode.CaseRange{0x1c87, 0x1c87, d{0, 0x0463 - 0x1c87, 0}}, // ᲇ
	unicode.CaseRange{0x1c88, 0x1c88, d{0, 0xa64b - 0x1c88, 0}}, // ᲈ
	unicode.CaseRange{0x1e9b, 0x1e9b, d{0, 0x1e61 - 0x1e9b, 0}}, // ẛ
	unicode.CaseRange{0x1fbe, 0x1fbe, d{0, 0x03b9 - 0x1fbe, 0}}, // ι
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the exceptions are that few it certainly seems valid to pre-generate the table and then use that plus regular ToLower in order to save on the more complex operations.

Copy link
Member

@calmh calmh Jan 31, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the unexported d is a bummer. Wonder what the thinking there is.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I played a bit with unicode.RangeTable but couldn't improve the performance:

var complexFolds = &unicode.RangeTable{
	R16: []unicode.Range16{
		{181, 305, 124},
		{383, 837, 454},
		{962, 976, 14},
		{977, 981, 4},
		{982, 1008, 26},
		{1009, 1013, 4},
		{7296, 7304, 1},
		{7835, 8126, 291},
	},
	R32:         []unicode.Range32{},
	LatinOffset: 0,
}

func toLower(r rune) rune {
	if r <= unicode.MaxASCII {
		if r < 'A' || 'Z' < r {
			return r
		}
		return r + 'a' - 'A'
	}
	if unicode.Is(complexFolds, r) {
		return unicode.To(unicode.LowerCase, unicode.To(unicode.UpperCase, r))
	}
	return unicode.To(unicode.LowerCase, r)
}

}
if 'A' <= c && c <= 'Z' {
isLower = false
}
}
return norm.NFC.String(rs.String())
return true, isLower
}

// Byte index of the first rune r s.t. lower(upper(r)) != r.
func firstCaseChange(s string) int {
for i, r := range s {
if r <= unicode.MaxASCII && (r < 'A' || r > 'Z') {
func toLowerASCII(s string) string {
var (
b strings.Builder
pos int
)
b.Grow(len(s))
for i := 0; i < len(s); i++ {
c := s[i]
if c < 'A' || 'Z' < c {
continue
}
if unicode.ToLower(unicode.ToUpper(r)) != r {
return i
if pos < i {
b.WriteString(s[pos:i])
}
pos = i + 1
c += 'a' - 'A'
b.WriteByte(c)
}
if pos != len(s) {
b.WriteString(s[pos:])
}
return b.String()
}

func toLowerUnicode(s string) string {
s = strings.Map(toLower, s)
return norm.NFC.String(s)
}

func toLower(r rune) rune {
if r <= unicode.MaxASCII {
if r < 'A' || 'Z' < r {
return r
}
return r + 'a' - 'A'
}
if r <= unicode.MaxLatin1 && r != 'µ' {
return unicode.To(unicode.LowerCase, r)
}
return -1
return unicode.To(unicode.LowerCase, unicode.To(unicode.UpperCase, r))
}
37 changes: 20 additions & 17 deletions lib/fs/folding_test.go
Expand Up @@ -49,6 +49,18 @@ var caseCases = [][2]string{
{"a\xCC\x88", "\xC3\xA4"}, // ä
}

var benchmarkCases = [][2]string{
{"img_202401241010.jpg", "ASCII lowercase"},
{"IMG_202401241010.jpg", "ASCII mixedcase start"},
{"img_202401241010.JPG", "ASCII mixedcase end"},
{"wir_kinder_aus_bullerbü.epub", "Latin1 lowercase"},
{"Wir_Kinder_aus_Bullerbü.epub", "Latin1 mixedcase start"},
{"wir_kinder_aus_bullerbü.EPUB", "Latin1 mixedcase end"},
{"translated_ウェブの国際化.html", "Unicode lowercase"},
{"Translated_ウェブの国際化.html", "Unicode mixedcase start"},
{"translated_ウェブの国際化.HTML", "Unicode mixedcase end"},
}

func TestUnicodeLowercaseNormalized(t *testing.T) {
for _, tc := range caseCases {
res := UnicodeLowercaseNormalized(tc[0])
Expand All @@ -58,22 +70,13 @@ func TestUnicodeLowercaseNormalized(t *testing.T) {
}
}

func BenchmarkUnicodeLowercaseMaybeChange(b *testing.B) {
b.ReportAllocs()

for i := 0; i < b.N; i++ {
for _, s := range caseCases {
UnicodeLowercaseNormalized(s[0])
}
}
}

func BenchmarkUnicodeLowercaseNoChange(b *testing.B) {
b.ReportAllocs()

for i := 0; i < b.N; i++ {
for _, s := range caseCases {
UnicodeLowercaseNormalized(s[1])
}
func BenchmarkUnicodeLowercase(b *testing.B) {
for _, c := range benchmarkCases {
b.Run(c[1], func(b *testing.B) {
b.ReportAllocs()
for i := 0; i < b.N; i++ {
UnicodeLowercaseNormalized(c[0])
}
})
}
}