Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Modify benchmarks to compare against stdlib functions #133

Merged
merged 1 commit into from Apr 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 3 additions & 4 deletions Cargo.toml
Expand Up @@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.

[dev-dependencies]
quickcheck = "0.7"
criterion = "0.3"
criterion = "0.5"

[[bench]]
name = "graphemes"
name = "chars"
harness = false

[[bench]]
name = "unicode_words"
name = "words"
harness = false

[[bench]]
name = "word_bounds"
harness = false

60 changes: 60 additions & 0 deletions benches/chars.rs
@@ -0,0 +1,60 @@
//! Compares the performance of `UnicodeSegmentation::graphemes` with stdlib's UTF-8 scalar-based
//! `std::str::chars`.
//!
//! It is expected that `std::str::chars` is faster than `UnicodeSegmentation::graphemes` since it
//! does not consider the complexity of grapheme clusters. The question this benchmark answers
//! is how much slower full Unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use unicode_segmentation;

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

// Benchmark fixtures: each entry names a file at benches/texts/<name>.txt.
// The set covers scripts with very different segmentation behavior
// (RTL Arabic, CJK, Indic) plus source code as a non-natural-language case.
const FILES: &[&str] = &[
    "arabic",
    "english",
    "hindi",
    "japanese",
    "korean",
    "mandarin",
    "russian",
    "source_code",
];

#[inline(always)]
fn grapheme(text: &str) {
for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
black_box(c);
}
}

/// Baseline: iterates `char`s (Unicode scalar values) using only the
/// standard library, with `black_box` preventing dead-code elimination.
#[inline(always)]
fn scalar(text: &str) {
    black_box(text).chars().for_each(|ch| {
        black_box(ch);
    });
}

/// Registers the `chars` benchmark group: for each fixture text, measures
/// grapheme-cluster iteration (`unicode-segmentation`) against plain `char`
/// iteration (stdlib) over identical input.
fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("chars");

    for file in FILES {
        // Read each fixture exactly once and share it between both variants;
        // the previous version re-read every file from disk for each variant.
        let content = fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap();

        group.bench_with_input(BenchmarkId::new("grapheme", file), &content, |b, content| {
            b.iter(|| grapheme(content))
        });
        group.bench_with_input(BenchmarkId::new("scalar", file), &content, |b, content| {
            b.iter(|| scalar(content))
        });
    }

    // Explicitly finish the group so the summary is emitted even if more
    // groups are added to this binary later.
    group.finish();
}

// Hook the group into Criterion's custom harness (Cargo.toml sets
// `harness = false` for this bench target).
criterion_group!(benches, bench_all);
criterion_main!(benches);
63 changes: 0 additions & 63 deletions benches/graphemes.rs

This file was deleted.

61 changes: 0 additions & 61 deletions benches/unicode_words.rs

This file was deleted.

88 changes: 32 additions & 56 deletions benches/word_bounds.rs
@@ -1,61 +1,37 @@
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

// Legacy helper (the pre-refactor version shown as deleted in this diff):
// benchmarks `split_word_bounds` over the text at `path`, registering the
// benchmark under the name `word_bounds_<lang>`.
fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
    let text = fs::read_to_string(path).unwrap();
    c.bench_function(&format!("word_bounds_{}", lang), |bench| {
        bench.iter(|| {
            for w in text.split_word_bounds() {
                black_box(w);
            }
        });
    });
}

// Legacy per-language entry points (pre-refactor, shown as deleted in this
// diff); each forwards to `word_bounds` with the matching fixture file.
fn word_bounds_arabic(c: &mut Criterion) {
    word_bounds(c, "arabic", "benches/texts/arabic.txt");
}

fn word_bounds_english(c: &mut Criterion) {
    word_bounds(c, "english", "benches/texts/english.txt");
}

fn word_bounds_hindi(c: &mut Criterion) {
    word_bounds(c, "hindi", "benches/texts/hindi.txt");
}

fn word_bounds_japanese(c: &mut Criterion) {
    word_bounds(c, "japanese", "benches/texts/japanese.txt");
}

fn word_bounds_korean(c: &mut Criterion) {
    word_bounds(c, "korean", "benches/texts/korean.txt");
}

fn word_bounds_mandarin(c: &mut Criterion) {
    word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
}

fn word_bounds_russian(c: &mut Criterion) {
    word_bounds(c, "russian", "benches/texts/russian.txt");
}

fn word_bounds_source_code(c: &mut Criterion) {
    word_bounds(c, "source_code", "benches/texts/source_code.txt");
}

// Legacy group registration (pre-refactor, shown as deleted in this diff):
// one separately-named benchmark function per language.
criterion_group!(
    benches,
    word_bounds_arabic,
    word_bounds_english,
    word_bounds_hindi,
    word_bounds_japanese,
    word_bounds_korean,
    word_bounds_mandarin,
    word_bounds_russian,
    word_bounds_source_code,
);

// Benchmark fixtures: each entry names a file at benches/texts/<name>.txt.
// The set covers scripts with very different segmentation behavior
// (RTL Arabic, CJK, Indic) plus source code as a non-natural-language case.
const FILES: &[&str] = &[
    "arabic",
    "english",
    "hindi",
    "japanese",
    "korean",
    "mandarin",
    "russian",
    "source_code",
];

#[inline(always)]
fn grapheme(text: &str) {
for w in text.split_word_bounds() {
black_box(w);
}
}

/// Registers the `word_bounds` benchmark group: measures
/// `split_word_bounds` over each fixture text. The stdlib has no word-
/// boundary equivalent, so only the unicode-segmentation variant exists.
fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("word_bounds");

    for file in FILES {
        // Read the fixture once, outside the measured closure.
        let content = fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap();
        group.bench_with_input(BenchmarkId::new("grapheme", file), &content, |b, content| {
            b.iter(|| grapheme(content))
        });
    }

    // Explicitly finish the group so the summary is emitted even if more
    // groups are added to this binary later.
    group.finish();
}

// Hook the group into Criterion's custom harness (Cargo.toml sets
// `harness = false` for this bench target).
criterion_group!(benches, bench_all);
criterion_main!(benches);
59 changes: 59 additions & 0 deletions benches/words.rs
@@ -0,0 +1,59 @@
//! Compares the performance of `UnicodeSegmentation::unicode_words` with stdlib's UTF-8
//! scalar-based `std::str::split_whitespace`.
//!
//! It is expected that `std::str::split_whitespace` is faster than
//! `UnicodeSegmentation::unicode_words` since it splits only on whitespace and does not apply
//! the full Unicode word-boundary rules. The question this benchmark answers is how much slower
//! full Unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

// Benchmark fixtures: each entry names a file at benches/texts/<name>.txt.
// The set covers scripts with very different segmentation behavior
// (RTL Arabic, CJK, Indic) plus source code as a non-natural-language case.
const FILES: &[&str] = &[
    "arabic",
    "english",
    "hindi",
    "japanese",
    "korean",
    "mandarin",
    "russian",
    "source_code",
];

#[inline(always)]
fn grapheme(text: &str) {
for w in text.unicode_words() {
black_box(w);
}
}

#[inline(always)]
fn scalar(text: &str) {
for w in text.split_whitespace() {
black_box(w);
}
}

/// Registers the `words` benchmark group: for each fixture text, measures
/// Unicode word segmentation (`unicode-segmentation`) against whitespace
/// splitting (stdlib) over identical input.
fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("words");

    for file in FILES {
        // Read each fixture exactly once and share it between both variants;
        // the previous version re-read every file from disk for each variant.
        let content = fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap();

        group.bench_with_input(BenchmarkId::new("grapheme", file), &content, |b, content| {
            b.iter(|| grapheme(content))
        });
        group.bench_with_input(BenchmarkId::new("scalar", file), &content, |b, content| {
            b.iter(|| scalar(content))
        });
    }

    // Explicitly finish the group so the summary is emitted even if more
    // groups are added to this binary later.
    group.finish();
}

// Hook the group into Criterion's custom harness (Cargo.toml sets
// `harness = false` for this bench target).
criterion_group!(benches, bench_all);
criterion_main!(benches);