Merge pull request #133 from Jonas-Heinrich/master
Modify benchmarks to compare against stdlib functions
Manishearth committed Apr 16, 2024
2 parents baab923 + d6a6eb8 commit 3ff9de6
Showing 6 changed files with 154 additions and 184 deletions.
7 changes: 3 additions & 4 deletions Cargo.toml
@@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.

[dev-dependencies]
quickcheck = "0.7"
-criterion = "0.3"
+criterion = "0.5"

[[bench]]
-name = "graphemes"
+name = "chars"
harness = false

[[bench]]
-name = "unicode_words"
+name = "words"
harness = false

[[bench]]
name = "word_bounds"
harness = false
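
Because each `[[bench]]` target sets `harness = false`, the renamed benchmarks are selected by name with Cargo's standard bench filter (usage note, not part of the diff):

    cargo bench --bench chars
    cargo bench --bench words
    cargo bench --bench word_bounds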

60 changes: 60 additions & 0 deletions benches/chars.rs
@@ -0,0 +1,60 @@
//! Compares the performance of `UnicodeSegmentation::graphemes` with stdlib's UTF-8 scalar-based
//! `std::str::chars`.
//!
//! It is expected that `std::str::chars` is faster than `UnicodeSegmentation::graphemes` since it
//! does not consider the complexity of grapheme clusters. The question this benchmark answers is
//! how much slower full Unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
use unicode_segmentation;

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

const FILES: &[&str] = &[
    "arabic",
    "english",
    "hindi",
    "japanese",
    "korean",
    "mandarin",
    "russian",
    "source_code",
];

#[inline(always)]
fn grapheme(text: &str) {
    for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
        black_box(c);
    }
}

#[inline(always)]
fn scalar(text: &str) {
    for c in black_box(&*text).chars() {
        black_box(c);
    }
}

fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("chars");

    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("grapheme", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            |b, content| b.iter(|| grapheme(content)),
        );
    }

    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("scalar", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            |b, content| b.iter(|| scalar(content)),
        );
    }
}

criterion_group!(benches, bench_all);
criterion_main!(benches);
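
To make concrete what the two functions above measure, here is a minimal sketch (not part of the commit) of why grapheme iteration does more work than `chars`: a single user-perceived character can span several Unicode scalar values.

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let s = "e\u{301}"; // 'e' plus U+0301 COMBINING ACUTE ACCENT, displayed as "é"
        assert_eq!(s.chars().count(), 2); // two Unicode scalar values
        assert_eq!(s.graphemes(true).count(), 1); // one grapheme cluster
    }
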
63 changes: 0 additions & 63 deletions benches/graphemes.rs

This file was deleted.

61 changes: 0 additions & 61 deletions benches/unicode_words.rs

This file was deleted.

88 changes: 32 additions & 56 deletions benches/word_bounds.rs
@@ -1,61 +1,37 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

-fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("word_bounds_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.split_word_bounds() {
-                black_box(w);
-            }
-        });
-    });
-}

-fn word_bounds_arabic(c: &mut Criterion) {
-    word_bounds(c, "arabic", "benches/texts/arabic.txt");
-}

-fn word_bounds_english(c: &mut Criterion) {
-    word_bounds(c, "english", "benches/texts/english.txt");
-}

-fn word_bounds_hindi(c: &mut Criterion) {
-    word_bounds(c, "hindi", "benches/texts/hindi.txt");
-}

-fn word_bounds_japanese(c: &mut Criterion) {
-    word_bounds(c, "japanese", "benches/texts/japanese.txt");
-}

-fn word_bounds_korean(c: &mut Criterion) {
-    word_bounds(c, "korean", "benches/texts/korean.txt");
-}

-fn word_bounds_mandarin(c: &mut Criterion) {
-    word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
-}

-fn word_bounds_russian(c: &mut Criterion) {
-    word_bounds(c, "russian", "benches/texts/russian.txt");
-}

-fn word_bounds_source_code(c: &mut Criterion) {
-    word_bounds(c, "source_code", "benches/texts/source_code.txt");
-}

-criterion_group!(
-    benches,
-    word_bounds_arabic,
-    word_bounds_english,
-    word_bounds_hindi,
-    word_bounds_japanese,
-    word_bounds_korean,
-    word_bounds_mandarin,
-    word_bounds_russian,
-    word_bounds_source_code,
-);

+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];

+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.split_word_bounds() {
+        black_box(w);
+    }
+}

+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("word_bounds");

+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("grapheme", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| grapheme(content)),
+        );
+    }
+}

+criterion_group!(benches, bench_all);
criterion_main!(benches);
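
Note that this benchmark keeps only a `grapheme` variant: the stdlib has no equivalent of word-boundary segmentation to compare against. For reference, a minimal sketch (not part of the commit) of what `split_word_bounds` yields, following the crate's documented behavior:

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let pieces: Vec<&str> = "The quick (\"brown\")".split_word_bounds().collect();
        // Word bounds keep every segment, including whitespace and punctuation.
        assert_eq!(pieces, ["The", " ", "quick", " ", "(", "\"", "brown", "\"", ")"]);
    }
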
59 changes: 59 additions & 0 deletions benches/words.rs
@@ -0,0 +1,59 @@
//! Compares the performance of `UnicodeSegmentation::unicode_words` with stdlib's UTF-8
//! scalar-based `std::str::split_whitespace`.
//!
//! It is expected that `std::str::split_whitespace` is faster than
//! `UnicodeSegmentation::unicode_words` since it does not consider the complexity of Unicode
//! word boundaries. The question this benchmark answers is how much slower full Unicode handling is.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};

use std::fs;
use unicode_segmentation::UnicodeSegmentation;

const FILES: &[&str] = &[
    "arabic",
    "english",
    "hindi",
    "japanese",
    "korean",
    "mandarin",
    "russian",
    "source_code",
];

#[inline(always)]
fn grapheme(text: &str) {
    for w in text.unicode_words() {
        black_box(w);
    }
}

#[inline(always)]
fn scalar(text: &str) {
    for w in text.split_whitespace() {
        black_box(w);
    }
}

fn bench_all(c: &mut Criterion) {
    let mut group = c.benchmark_group("words");

    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("grapheme", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            |b, content| b.iter(|| grapheme(content)),
        );
    }

    for file in FILES {
        group.bench_with_input(
            BenchmarkId::new("scalar", file),
            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
            |b, content| b.iter(|| scalar(content)),
        );
    }
}

criterion_group!(benches, bench_all);
criterion_main!(benches);
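
As in the chars benchmark, the two sides do related but not identical work, which is what makes the comparison interesting. A minimal sketch (not part of the commit), following the crate's documented behavior:

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let s = "The quick (\"brown\") fox";
        // `unicode_words` applies Unicode word segmentation and drops punctuation:
        assert_eq!(s.unicode_words().collect::<Vec<&str>>(), ["The", "quick", "brown", "fox"]);
        // `split_whitespace` splits on whitespace only, keeping punctuation attached:
        assert_eq!(s.split_whitespace().collect::<Vec<&str>>(), ["The", "quick", "(\"brown\")", "fox"]);
    }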
