Skip to content

Commit

Permalink
Merge pull request #2 from emattiza/chore/add-benchmarks
Browse files Browse the repository at this point in the history
update: v0.1.0 -> v0.1.1
  • Loading branch information
emattiza committed Apr 26, 2023
2 parents 5ad9719 + cc166a3 commit 5e3d12a
Show file tree
Hide file tree
Showing 8 changed files with 1,601 additions and 9 deletions.
10 changes: 5 additions & 5 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "rs_chardet"
version = "0.1.0"
version = "0.1.1"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
Expand Down
15 changes: 15 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,19 @@

This is a thin wrapper utilizing [chardet-ng] for character encoding detection.

## Benchmarking
### Results

| Library | Version | Rate (call(s)/s) |
|------------|---------|---------------------|
| chardet | v5.1.0 | 0.20407073211428026 |
| rs_chardet | v0.1.1 | 25.446704726774133 |
| cchardet | v2.1.18 | 917.8711484593837 |

### Benchmark System Information:
Results were produced by running [benchmark-py] on a Hetzner CPX31 (4 vCPU, 8 GB RAM) located in the US.


[chardet-ng]: https://github.com/hsivonen/chardetng
[benchmark-py]: ./benchmark/bench.py
51 changes: 51 additions & 0 deletions benchmark/bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from io import TextIOBase
import sys
import time
from typing import Any, Callable, Optional
from pathlib import Path


def benchmark_impl(
    msg: bytes,
    detector: Callable[[Any], Any],
    n_calls: int,
    module: Any,
    output_buf: Optional[TextIOBase] = None,
):
    """Time ``detector(msg)`` over ``n_calls`` runs and print the mean call rate.

    Only the time spent inside ``detector`` is accumulated; loop overhead
    between calls is excluded.

    Args:
        msg: Payload handed unchanged to ``detector`` on every call.
        detector: Callable under test; its return value is discarded.
        n_calls: Number of timed invocations. Must be at least 1.
        module: Object exposing ``__name__`` and ``__version__``; both are
            used to label the printed line.
        output_buf: Text stream to print to. Defaults to ``sys.stdout``.

    Raises:
        ValueError: If ``n_calls`` is less than 1.
    """
    if n_calls < 1:
        raise ValueError("n_calls must be at least 1, got %r" % (n_calls,))

    elapsed = 0.0
    for _ in range(n_calls):
        # perf_counter is monotonic and high-resolution; time.time() can be
        # coarse or jump with wall-clock adjustments, skewing short timings.
        start = time.perf_counter()
        detector(msg)
        elapsed += time.perf_counter() - start

    # A detector faster than the clock resolution yields zero elapsed time;
    # report an unbounded rate instead of crashing with ZeroDivisionError.
    rate = n_calls / elapsed if elapsed > 0 else float("inf")
    print(
        "%s v%s:" % (module.__name__, module.__version__),
        rate,
        "call(s)/s",
        file=(sys.stdout if output_buf is None else output_buf),
    )


def main():
    """Benchmark chardet, rs_chardet and cchardet against the bundled sample.

    Reads ``samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt`` (resolved
    relative to this file) as raw bytes and passes it to ``benchmark_impl``
    once per library, in the order chardet, rs_chardet, cchardet. Each
    library is timed over ``do_times`` calls and reported on stdout.
    """
    # Function-scope imports: the libraries are only needed when the
    # benchmark actually runs.
    import chardet
    import cchardet
    import rs_chardet

    do_times = 5
    sample_path = (
        Path(__file__).parent
    ) / "samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt"
    msg = sample_path.read_bytes()

    # Pass each library's detection function directly: wrapping them in
    # lambdas would add an extra Python call frame to every timed call.
    contenders = (
        (chardet.detect, chardet),
        (rs_chardet.detect_rs_enc_name, rs_chardet),
        (cchardet.detect, cchardet),
    )
    for detector, module in contenders:
        benchmark_impl(msg, detector, do_times, module, None)


# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
main()
1,522 changes: 1,522 additions & 0 deletions benchmark/samples/wikipediaJa_One_Thousand_and_One_Nights_SJIS.txt

Large diffs are not rendered by default.

7 changes: 5 additions & 2 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
overlays = [inputs.rust-overlay.overlays.default];
};
project_name = "rs_chardet";
project_version = "0.1.0";
project_version = "0.1.1";
python_version = pkgs.python310;
buildPythonPackage = pkgs.python310Packages.buildPythonPackage;
in rec {
Expand All @@ -25,14 +25,17 @@
default = packages.pythonpkg;
pythonpkg = python_version.withPackages (ps: [
lib.python_module
ps.chardet
ps.cchardet
]);
};
devShells.default = pkgs.mkShell {
buildInputs = [
pkgs.rustc
pkgs.rust-analyzer
pkgs.cargo
python_version
pkgs.maturin
packages.pythonpkg
];
};
lib = {
Expand Down
2 changes: 1 addition & 1 deletion nix/package.nix
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ buildPythonPackage rec {
cargoDeps = rustPlatform.fetchCargoTarball {
inherit src;
name = "${pname}-${version}";
hash = "sha256-JAS0TYnxXaRzPXLksW1MKKmPk+20HRzDzvGDLuMXbUM=";
hash = "sha256-phmxYqJ7fWZHJH1BI7XEymqXK+Mchd37scEGTy/mLZk=";
};

nativeBuildInputs = with rustPlatform; [cargoSetupHook maturinBuildHook];
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ fn detect_codec(a: &[u8]) -> PyResult<PyObject> {
/// A Python module implemented in Rust.
#[pymodule]
fn rs_chardet(_py: Python, m: &PyModule) -> PyResult<()> {
m.add("__version__", env!("CARGO_PKG_VERSION"))?;
m.add_function(wrap_pyfunction!(detect_rs_enc_name, m)?)?;
m.add_function(wrap_pyfunction!(detect_codec, m)?)?;
Ok(())
Expand Down

0 comments on commit 5e3d12a

Please sign in to comment.