Skip to content

Commit

Permalink
Modify benchmarks to compare against stdlib functions
Browse files Browse the repository at this point in the history
This commit refactors and expands the microbenchmarks in order to
evaluate the performance hit of handling full Unicode. It is expected
that `unicode-segmentation`'s functions are slower since they consider
graphemes; the question is just how much slower.

- bump criterion dependency
- rename benchmarks to remove unicode/grapheme relationship
- move benchmarks into benchmark group
- add scalar versions with stdlib "equivalents" (scalars)
  • Loading branch information
Jonas-Heinrich committed Apr 16, 2024
1 parent baab923 commit d6a6eb8
Show file tree
Hide file tree
Showing 6 changed files with 154 additions and 184 deletions.
7 changes: 3 additions & 4 deletions Cargo.toml
Expand Up @@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.

[dev-dependencies]
quickcheck = "0.7"
criterion = "0.3"
criterion = "0.5"

[[bench]]
name = "graphemes"
name = "chars"
harness = false

[[bench]]
name = "unicode_words"
name = "words"
harness = false

[[bench]]
name = "word_bounds"
harness = false

60 changes: 60 additions & 0 deletions benches/chars.rs
@@ -0,0 +1,60 @@
//! Compares the performance of `UnicodeSegmentation::graphemes` with stdlib's UTF-8 scalar-based
//! `std::str::chars`.
//!
//! It is expected that `std::str::chars` is faster than `UnicodeSegmentation::graphemes` since it
//! does not consider the complexity of grapheme clusters. The question in this benchmark
//! is how much slower full unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use unicode_segmentation;

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

/// Base names of the sample texts to benchmark.
///
/// Each entry `name` is loaded from `benches/texts/<name>.txt` and is also used as the
/// parameter part of the benchmark id.
const FILES: &[&str] = &[
"arabic",
"english",
"hindi",
"japanese",
"korean",
"mandarin",
"russian",
"source_code",
];

/// Walks every extended grapheme cluster of `text` (the `true` flag selects extended
/// clusters) and discards each one.
///
/// Both the input and every yielded cluster pass through `black_box` so the optimizer
/// cannot remove the segmentation work being measured.
#[inline(always)]
fn grapheme(text: &str) {
    let input: &str = black_box(text);
    let clusters = UnicodeSegmentation::graphemes(input, true);
    for cluster in clusters {
        black_box(cluster);
    }
}

/// Baseline: walks every `char` of `text` using the standard library and discards each one.
///
/// Both the input and every yielded `char` pass through `black_box` so the optimizer
/// cannot remove the decoding work being measured.
#[inline(always)]
fn scalar(text: &str) {
    let input: &str = black_box(text);
    let scalars = input.chars();
    for ch in scalars {
        black_box(ch);
    }
}

/// Registers the `chars` benchmark group: for every entry in `FILES`, one `grapheme`
/// benchmark and one `scalar` benchmark over the same text.
///
/// All `grapheme` benchmarks run first, followed by all `scalar` benchmarks.
///
/// # Panics
///
/// Panics if any `benches/texts/<name>.txt` file cannot be read; the message names the
/// offending path.
fn bench_all(c: &mut Criterion) {
    // Load each sample exactly once, before any measurement starts, so both variants
    // operate on the very same buffer and a missing file is reported up front.
    let texts: Vec<(&str, String)> = FILES
        .iter()
        .map(|&name| {
            let path = format!("benches/texts/{}.txt", name);
            let content = fs::read_to_string(&path)
                .unwrap_or_else(|err| panic!("failed to read {}: {}", path, err));
            (name, content)
        })
        .collect();

    let mut group = c.benchmark_group("chars");

    for (name, content) in &texts {
        group.bench_with_input(BenchmarkId::new("grapheme", name), content, |b, content| {
            b.iter(|| grapheme(content))
        });
    }

    for (name, content) in &texts {
        group.bench_with_input(BenchmarkId::new("scalar", name), content, |b, content| {
            b.iter(|| scalar(content))
        });
    }

    // Criterion recommends finishing a group explicitly instead of relying on `Drop`.
    group.finish();
}

// Bundle `bench_all` into the `benches` group and let Criterion supply the entry point
// for this bench target.
criterion_group!(benches, bench_all);
criterion_main!(benches);
63 changes: 0 additions & 63 deletions benches/graphemes.rs

This file was deleted.

61 changes: 0 additions & 61 deletions benches/unicode_words.rs

This file was deleted.

88 changes: 32 additions & 56 deletions benches/word_bounds.rs
@@ -1,61 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

/// Benchmarks `split_word_bounds` over the text stored at `path`.
///
/// The benchmark is registered as `word_bounds_<lang>`. Every yielded segment is passed
/// through `black_box` so the iteration is not optimized away.
///
/// # Panics
///
/// Panics if the file at `path` cannot be read.
fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
    let text = fs::read_to_string(path).unwrap();
    let bench_id = format!("word_bounds_{}", lang);
    c.bench_function(&bench_id, |bencher| {
        bencher.iter(|| {
            let segments = text.split_word_bounds();
            for segment in segments {
                black_box(segment);
            }
        })
    });
}

/// Runs the `word_bounds` benchmark on the Arabic sample text.
fn word_bounds_arabic(c: &mut Criterion) {
    let (lang, path) = ("arabic", "benches/texts/arabic.txt");
    word_bounds(c, lang, path);
}

/// Runs the `word_bounds` benchmark on the English sample text.
fn word_bounds_english(c: &mut Criterion) {
    let (lang, path) = ("english", "benches/texts/english.txt");
    word_bounds(c, lang, path);
}

/// Runs the `word_bounds` benchmark on the Hindi sample text.
fn word_bounds_hindi(c: &mut Criterion) {
    let (lang, path) = ("hindi", "benches/texts/hindi.txt");
    word_bounds(c, lang, path);
}

/// Runs the `word_bounds` benchmark on the Japanese sample text.
fn word_bounds_japanese(c: &mut Criterion) {
    let (lang, path) = ("japanese", "benches/texts/japanese.txt");
    word_bounds(c, lang, path);
}

/// Runs the `word_bounds` benchmark on the Korean sample text.
fn word_bounds_korean(c: &mut Criterion) {
    let (lang, path) = ("korean", "benches/texts/korean.txt");
    word_bounds(c, lang, path);
}

/// Runs the `word_bounds` benchmark on the Mandarin sample text.
fn word_bounds_mandarin(c: &mut Criterion) {
    let (lang, path) = ("mandarin", "benches/texts/mandarin.txt");
    word_bounds(c, lang, path);
}

/// Runs the `word_bounds` benchmark on the Russian sample text.
fn word_bounds_russian(c: &mut Criterion) {
    let (lang, path) = ("russian", "benches/texts/russian.txt");
    word_bounds(c, lang, path);
}

/// Runs the `word_bounds` benchmark on the source-code sample text.
fn word_bounds_source_code(c: &mut Criterion) {
    let (lang, path) = ("source_code", "benches/texts/source_code.txt");
    word_bounds(c, lang, path);
}

// Collect the per-language benchmark functions into a single group named `benches`.
criterion_group!(
benches,
word_bounds_arabic,
word_bounds_english,
word_bounds_hindi,
word_bounds_japanese,
word_bounds_korean,
word_bounds_mandarin,
word_bounds_russian,
word_bounds_source_code,
);

/// Base names of the sample texts to benchmark.
///
/// Each entry `name` is loaded from `benches/texts/<name>.txt` and is also used as the
/// parameter part of the benchmark id.
const FILES: &[&str] = &[
"arabic",
"english",
"hindi",
"japanese",
"korean",
"mandarin",
"russian",
"source_code",
];

/// Walks every segment produced by `split_word_bounds` on `text` and discards it.
///
/// Despite its name this measures word-boundary splitting; the name matches the
/// `grapheme` benchmark id used by `bench_all`. Each segment passes through `black_box`
/// so the iteration is not optimized away.
#[inline(always)]
fn grapheme(text: &str) {
    let segments = text.split_word_bounds();
    for segment in segments {
        black_box(segment);
    }
}

/// Registers the `word_bounds` benchmark group: one `grapheme` benchmark per entry in
/// `FILES`.
///
/// # Panics
///
/// Panics if a `benches/texts/<name>.txt` file cannot be read; the message names the
/// offending path.
fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("word_bounds");

    for &name in FILES {
        let path = format!("benches/texts/{}.txt", name);
        // Name the file in the panic so a missing sample is easy to diagnose.
        let content = fs::read_to_string(&path)
            .unwrap_or_else(|err| panic!("failed to read {}: {}", path, err));

        group.bench_with_input(BenchmarkId::new("grapheme", name), &content, |b, content| {
            b.iter(|| grapheme(content))
        });
    }

    // Criterion recommends finishing a group explicitly instead of relying on `Drop`.
    group.finish();
}

// Bundle `bench_all` into the `benches` group and let Criterion supply the entry point
// for this bench target.
criterion_group!(benches, bench_all);
criterion_main!(benches);
59 changes: 59 additions & 0 deletions benches/words.rs
@@ -0,0 +1,59 @@
//! Compares the performance of `UnicodeSegmentation::unicode_words` with stdlib's UTF-8
//! scalar-based `std::str::split_whitespace`.
//!
//! It is expected that `std::str::split_whitespace` is faster than
//! `UnicodeSegmentation::unicode_words` since it only splits on whitespace and does not apply
//! the Unicode word-boundary rules (UAX #29). The question in this benchmark is how much slower
//! full Unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

/// Base names of the sample texts to benchmark.
///
/// Each entry `name` is loaded from `benches/texts/<name>.txt` and is also used as the
/// parameter part of the benchmark id.
const FILES: &[&str] = &[
"arabic",
"english",
"hindi",
"japanese",
"korean",
"mandarin",
"russian",
"source_code",
];

/// Walks every word yielded by `unicode_words` on `text` and discards it.
///
/// Despite its name this measures word segmentation; the name matches the `grapheme`
/// benchmark id used by `bench_all`. Each word passes through `black_box` so the
/// iteration is not optimized away.
#[inline(always)]
fn grapheme(text: &str) {
    let words = text.unicode_words();
    for word in words {
        black_box(word);
    }
}

/// Baseline: walks every whitespace-separated piece of `text` using the standard library
/// and discards it.
///
/// Each piece passes through `black_box` so the iteration is not optimized away.
#[inline(always)]
fn scalar(text: &str) {
    let pieces = text.split_whitespace();
    for piece in pieces {
        black_box(piece);
    }
}

/// Registers the `words` benchmark group: for every entry in `FILES`, one `grapheme`
/// benchmark and one `scalar` benchmark over the same text.
///
/// All `grapheme` benchmarks run first, followed by all `scalar` benchmarks.
///
/// # Panics
///
/// Panics if any `benches/texts/<name>.txt` file cannot be read; the message names the
/// offending path.
fn bench_all(c: &mut Criterion) {
    // Load each sample exactly once, before any measurement starts, so both variants
    // operate on the very same buffer and a missing file is reported up front.
    let texts: Vec<(&str, String)> = FILES
        .iter()
        .map(|&name| {
            let path = format!("benches/texts/{}.txt", name);
            let content = fs::read_to_string(&path)
                .unwrap_or_else(|err| panic!("failed to read {}: {}", path, err));
            (name, content)
        })
        .collect();

    let mut group = c.benchmark_group("words");

    for (name, content) in &texts {
        group.bench_with_input(BenchmarkId::new("grapheme", name), content, |b, content| {
            b.iter(|| grapheme(content))
        });
    }

    for (name, content) in &texts {
        group.bench_with_input(BenchmarkId::new("scalar", name), content, |b, content| {
            b.iter(|| scalar(content))
        });
    }

    // Criterion recommends finishing a group explicitly instead of relying on `Drop`.
    group.finish();
}

// Bundle `bench_all` into the `benches` group and let Criterion supply the entry point
// for this bench target.
criterion_group!(benches, bench_all);
criterion_main!(benches);

0 comments on commit d6a6eb8

Please sign in to comment.