Modify benchmarks to compare against stdlib functions

This commit refactors and expands the microbenchmarks in order to evaluate the performance hit of handling full Unicode. It is expected that `unicode-segmentation`'s functions are slower since they consider graphemes; the question is just how much.

- bump criterion dependency
- rename benchmarks to remove unicode/grapheme relationship
- move benchmarks into benchmark group
- add scalar versions with stdlib "equivalents" (scalars)
unicode-rs · Apr 16, 2024 · d6a6eb8 · d6a6eb8
1 parent baab923
commit d6a6eb8
Show file tree

Hide file tree

Showing 6 changed files with 154 additions and 184 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
 
 [dev-dependencies]
 quickcheck = "0.7"
-criterion = "0.3"
+criterion = "0.5"
 
 [[bench]]
-name = "graphemes"
+name = "chars"
 harness = false
 
 [[bench]]
-name = "unicode_words"
+name = "words"
 harness = false
 
 [[bench]]
 name = "word_bounds"
 harness = false
-
diff --git a/benches/chars.rs b/benches/chars.rs
@@ -0,0 +1,60 @@
+//! Compares the performance of `UnicodeSegmentation::graphemes` with the standard library's
+//! `str::chars`, which iterates over Unicode scalar values.
+//!
+//! It is expected that `str::chars` is faster than `UnicodeSegmentation::graphemes` since it
+//! does not consider the complexity of grapheme clusters. The question in this benchmark
+//! is how much slower full Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use unicode_segmentation;
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+/// Base names of the sample texts; `bench_all` reads each one from `benches/texts/<name>.txt`.
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+/// Iterates over the grapheme clusters of `text` (extended mode, per the `true` flag) and
+/// passes each one to `black_box` so the loop cannot be optimized away.
+///
+/// The input itself also goes through `black_box` before it is segmented.
+#[inline(always)]
+fn grapheme(text: &str) {
+    let input = black_box(text);
+    for cluster in input.graphemes(true) {
+        black_box(cluster);
+    }
+}
+
+/// Iterates over the `char`s of `text` and passes each one to `black_box` so the loop cannot
+/// be optimized away.
+///
+/// The input itself also goes through `black_box` before it is iterated.
+#[inline(always)]
+fn scalar(text: &str) {
+    let input = black_box(text);
+    for ch in input.chars() {
+        black_box(ch);
+    }
+}
+
+/// Registers the `chars` benchmark group.
+///
+/// For every entry of `FILES` a `grapheme` benchmark is registered first, followed by a
+/// `scalar` benchmark for every entry, all running over the same loaded text.
+///
+/// # Panics
+///
+/// Panics, naming the offending path, if a sample text cannot be read into a `String`.
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("chars");
+
+    // Load every sample text once so both benchmark kinds share the same input instead of
+    // reading each file from disk twice.
+    let texts: Vec<(&str, String)> = FILES
+        .iter()
+        .map(|&file| {
+            let path = format!("benches/texts/{}.txt", file);
+            let content = fs::read_to_string(&path)
+                .unwrap_or_else(|err| panic!("failed to read {}: {}", path, err));
+            (file, content)
+        })
+        .collect();
+
+    for (file, content) in &texts {
+        group.bench_with_input(BenchmarkId::new("grapheme", file), content, |b, content| {
+            b.iter(|| grapheme(content))
+        });
+    }
+
+    for (file, content) in &texts {
+        group.bench_with_input(BenchmarkId::new("scalar", file), content, |b, content| {
+            b.iter(|| scalar(content))
+        });
+    }
+
+    // Finish the group explicitly rather than relying on the implicit finish on drop.
+    group.finish();
+}
+
+// Declare the `benches` group and the `main` entry point. The manifest sets
+// `harness = false` for this bench target, so the entry point comes from `criterion_main!`.
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
diff --git a/benches/graphemes.rs b/benches/graphemes.rs
diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
@@ -1,61 +1,37 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
 
 use std::fs;
 use unicode_segmentation::UnicodeSegmentation;
 
-fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("word_bounds_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.split_word_bounds() {
-                black_box(w);
-            }
-        });
-    });
-}
-
-fn word_bounds_arabic(c: &mut Criterion) {
-    word_bounds(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn word_bounds_english(c: &mut Criterion) {
-    word_bounds(c, "english", "benches/texts/english.txt");
-}
-
-fn word_bounds_hindi(c: &mut Criterion) {
-    word_bounds(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn word_bounds_japanese(c: &mut Criterion) {
-    word_bounds(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn word_bounds_korean(c: &mut Criterion) {
-    word_bounds(c, "korean", "benches/texts/korean.txt");
-}
-
-fn word_bounds_mandarin(c: &mut Criterion) {
-    word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn word_bounds_russian(c: &mut Criterion) {
-    word_bounds(c, "russian", "benches/texts/russian.txt");
-}
-
-fn word_bounds_source_code(c: &mut Criterion) {
-    word_bounds(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    word_bounds_arabic,
-    word_bounds_english,
-    word_bounds_hindi,
-    word_bounds_japanese,
-    word_bounds_korean,
-    word_bounds_mandarin,
-    word_bounds_russian,
-    word_bounds_source_code,
-);
-
+/// Base names of the sample texts; `bench_all` reads each one from `benches/texts/<name>.txt`.
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+/// Splits `text` with `split_word_bounds` and passes every resulting segment to `black_box`
+/// so the loop cannot be optimized away.
+///
+/// The function name matches the `grapheme` benchmark id that `bench_all` registers it under.
+#[inline(always)]
+fn grapheme(text: &str) {
+    for segment in UnicodeSegmentation::split_word_bounds(text) {
+        black_box(segment);
+    }
+}
+
+/// Registers the `word_bounds` benchmark group: one `grapheme` benchmark per entry of `FILES`.
+///
+/// # Panics
+///
+/// Panics, naming the offending path, if a sample text cannot be read into a `String`.
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("word_bounds");
+
+    for file in FILES {
+        let path = format!("benches/texts/{}.txt", file);
+        let content = fs::read_to_string(&path)
+            .unwrap_or_else(|err| panic!("failed to read {}: {}", path, err));
+
+        group.bench_with_input(BenchmarkId::new("grapheme", file), &content, |b, content| {
+            b.iter(|| grapheme(content))
+        });
+    }
+
+    // Finish the group explicitly rather than relying on the implicit finish on drop.
+    group.finish();
+}
+
+// Declare the `benches` group and the `main` entry point. The manifest sets
+// `harness = false` for this bench target, so the entry point comes from `criterion_main!`.
+criterion_group!(benches, bench_all);
 criterion_main!(benches);
diff --git a/benches/words.rs b/benches/words.rs
@@ -0,0 +1,59 @@
+//! Compares the performance of `UnicodeSegmentation::unicode_words` with the standard
+//! library's whitespace-based `str::split_whitespace`.
+//!
+//! It is expected that `str::split_whitespace` is faster than
+//! `UnicodeSegmentation::unicode_words` since it only splits on whitespace and does not apply
+//! Unicode word-boundary rules. The question in this benchmark is how much slower full
+//! Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+/// Base names of the sample texts; `bench_all` reads each one from `benches/texts/<name>.txt`.
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+/// Iterates over `text` with `unicode_words` and passes every word to `black_box` so the loop
+/// cannot be optimized away.
+///
+/// The function name matches the `grapheme` benchmark id that `bench_all` registers it under.
+#[inline(always)]
+fn grapheme(text: &str) {
+    for word in UnicodeSegmentation::unicode_words(text) {
+        black_box(word);
+    }
+}
+
+/// Iterates over `text` with the standard library's `split_whitespace` and passes every piece
+/// to `black_box` so the loop cannot be optimized away.
+///
+/// This is the stdlib counterpart that `bench_all` registers under the `scalar` benchmark id.
+#[inline(always)]
+fn scalar(text: &str) {
+    for piece in str::split_whitespace(text) {
+        black_box(piece);
+    }
+}
+
+/// Registers the `words` benchmark group.
+///
+/// For every entry of `FILES` a `grapheme` benchmark is registered first, followed by a
+/// `scalar` benchmark for every entry, all running over the same loaded text.
+///
+/// # Panics
+///
+/// Panics, naming the offending path, if a sample text cannot be read into a `String`.
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("words");
+
+    // Load every sample text once so both benchmark kinds share the same input instead of
+    // reading each file from disk twice.
+    let texts: Vec<(&str, String)> = FILES
+        .iter()
+        .map(|&file| {
+            let path = format!("benches/texts/{}.txt", file);
+            let content = fs::read_to_string(&path)
+                .unwrap_or_else(|err| panic!("failed to read {}: {}", path, err));
+            (file, content)
+        })
+        .collect();
+
+    for (file, content) in &texts {
+        group.bench_with_input(BenchmarkId::new("grapheme", file), content, |b, content| {
+            b.iter(|| grapheme(content))
+        });
+    }
+
+    for (file, content) in &texts {
+        group.bench_with_input(BenchmarkId::new("scalar", file), content, |b, content| {
+            b.iter(|| scalar(content))
+        });
+    }
+
+    // Finish the group explicitly rather than relying on the implicit finish on drop.
+    group.finish();
+}
+
+// Declare the `benches` group and the `main` entry point. The manifest sets
+// `harness = false` for this bench target, so the entry point comes from `criterion_main!`.
+criterion_group!(benches, bench_all);
+criterion_main!(benches);