From a33f4644b2256c89d38f895689463b8a4d5abb41 Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Mon, 24 Apr 2023 10:54:45 +0200 Subject: [PATCH 1/3] expose edit_distance_dp and add tests - Removed template code: - Only one value for the template is used (T=int64_t) - In cython it is easier to use non-template code - Fix warning: comparison of integer expressions of different signedness - --- editdistance/_editdistance.cpp | 11 +++++------ editdistance/_editdistance.h | 1 + editdistance/bycython.pyx | 16 ++++++++++++++++ test/test_editdistance.py | 16 ++++++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp index 220ddb7..dbb16a8 100644 --- a/editdistance/_editdistance.cpp +++ b/editdistance/_editdistance.cpp @@ -61,15 +61,14 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz /// c.f. http://handasse.blogspot.com/2009/04/c_29.html -template -unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, size_t const size2) { +unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2) { // vectorより固定長配列の方が速いが、文字列が長い時の保険でのみ呼ばれるのでサイズを決め打ちできない vector< vector > d(2, vector(size2 + 1)); d[0][0] = 0; d[1][0] = 1; - for (int i = 0; i < size2 + 1; i++) d[0][i] = i; - for (int i = 1; i < size1 + 1; i++) { - for (int j = 1; j < size2 + 1; j++) { + for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; + for (size_t i = 1; i < size1 + 1; i++) { + for (size_t j = 1; j < size2 + 1; j++) { d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 
0 : 1)); } } @@ -127,5 +126,5 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int else if(vsize == 8) return edit_distance_map_<8>(ap, *asizep, bp, *bsizep); else if(vsize == 9) return edit_distance_map_<9>(ap, *asizep, bp, *bsizep); else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep); - return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる + return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる } diff --git a/editdistance/_editdistance.h b/editdistance/_editdistance.h index 2671d00..ac5f71d 100644 --- a/editdistance/_editdistance.h +++ b/editdistance/_editdistance.h @@ -8,6 +8,7 @@ extern "C" { #endif unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize); +unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2); #ifdef __cplusplus } diff --git a/editdistance/bycython.pyx b/editdistance/bycython.pyx index d64a67a..258a03a 100644 --- a/editdistance/bycython.pyx +++ b/editdistance/bycython.pyx @@ -7,6 +7,8 @@ from libc.stdlib cimport malloc, free cdef extern from "./_editdistance.h": ctypedef int int64_t unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize) + unsigned int edit_distance_dp(const int64_t *str1, const size_t size1, const int64_t *str2, const size_t size2) + cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff: cdef unsigned int i, dist @@ -20,3 +22,17 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff: free(al) free(bl) return dist + + +cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff: + cdef unsigned int i, dist + cdef int64_t *al = malloc(len(a) * sizeof(int64_t)) + for i in range(len(a)): + al[i] = hash(a[i]) + cdef int64_t *bl = malloc(len(b) * sizeof(int64_t)) + for i in range(len(b)): + bl[i] = hash(b[i]) + 
dist = edit_distance_dp(al, len(a), bl, len(b)) + free(al) + free(bl) + return dist diff --git a/test/test_editdistance.py b/test/test_editdistance.py index 57ed1f3..2dc6242 100644 --- a/test/test_editdistance.py +++ b/test/test_editdistance.py @@ -1,9 +1,25 @@ import unittest +import random + class TestEditDistance(unittest.TestCase): def test_editdistance(self): import editdistance self.assertEqual(1, editdistance.eval('abc', 'aec')) + + def test_dp_editdistance(self): + from editdistance.bycython import eval_dp + self.assertEqual(3, eval_dp('bbb', 'a')) + self.assertEqual(3, eval_dp('a', 'bbb')) + + def test_dp_vs_default(self): + for _ in range(10): + import editdistance + from editdistance.bycython import eval_dp + seq1 = random.choices([0, 1, 2], k=random.randint(10, 50)) + seq2 = random.choices([0, 1, 2], k=random.randint(10, 50)) + + self.assertEqual(editdistance.eval(seq1, seq2), eval_dp(seq1, seq2)) if __name__ == '__main__': From 6f3fbe4fb63c81b52acff438d8de5340b37e0c2e Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Mon, 24 Apr 2023 10:54:53 +0200 Subject: [PATCH 2/3] fix edit_distance_dp --- editdistance/_editdistance.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp index dbb16a8..9769466 100644 --- a/editdistance/_editdistance.cpp +++ b/editdistance/_editdistance.cpp @@ -68,6 +68,7 @@ unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t c d[1][0] = 1; for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; for (size_t i = 1; i < size1 + 1; i++) { + d[i&1][0] = d[(i-1)&1][0] + 1; for (size_t j = 1; j < size2 + 1; j++) { d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 
0 : 1)); } From e36676adae51c134c568aadecafa9755f801214a Mon Sep 17 00:00:00 2001 From: vlemonidis Date: Mon, 22 Jan 2024 15:35:57 +0000 Subject: [PATCH 3/3] Added ability to prematurely terminate search if edit distance above a given threshold --- editdistance/__init__.pxd | 2 +- editdistance/__init__.py | 7 +++-- editdistance/_editdistance.cpp | 54 +++++++++++++++++++++++++++++++--- editdistance/_editdistance.h | 1 + editdistance/bycython.pxd | 1 + editdistance/bycython.pyx | 17 +++++++++++ setup.py | 2 +- test/test_editdistance.py | 6 +++- 8 files changed, 81 insertions(+), 9 deletions(-) diff --git a/editdistance/__init__.pxd b/editdistance/__init__.pxd index 1fcf681..2f0683b 100644 --- a/editdistance/__init__.pxd +++ b/editdistance/__init__.pxd @@ -1,2 +1,2 @@ # cython: language_level=3 -from editdistance.bycython cimport eval +from editdistance.bycython cimport eval, eval_criterion diff --git a/editdistance/__init__.py b/editdistance/__init__.py index 7ddd27e..ce3a0f6 100644 --- a/editdistance/__init__.py +++ b/editdistance/__init__.py @@ -1,9 +1,12 @@ -from .bycython import eval +from .bycython import eval, eval_criterion def distance(*args, **kwargs): """"An alias to eval""" return eval(*args, **kwargs) +def distance_le_than(*args, **kwargs): + """"An alias to eval""" + return eval_criterion(*args, **kwargs) -__all__ = ('eval', 'distance') +__all__ = ('eval', 'distance', "eval_criterion", "distance_le_than") diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp index 220ddb7..e9c2bf1 100644 --- a/editdistance/_editdistance.cpp +++ b/editdistance/_editdistance.cpp @@ -41,7 +41,7 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz for(size_t i = 0; i < tlen; ++i) VP[tmax] |= (1LL << i); for(size_t i = 0; i < vecsize; ++i) { TVALUE &PM = cmap[vec[i]]; - for(int r = 0; r <= tmax; ++r) { + for(unsigned int r = 0; r <= tmax; ++r) { uint64_t X = PM[r]; if(r > 0 && (HN[r - 1] & lmb)) X |= 1LL; D0[r] = (((X & 
VP[r]) + VP[r]) ^ VP[r]) | X | VN[r]; @@ -67,15 +67,36 @@ unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, vector< vector > d(2, vector(size2 + 1)); d[0][0] = 0; d[1][0] = 1; - for (int i = 0; i < size2 + 1; i++) d[0][i] = i; - for (int i = 1; i < size1 + 1; i++) { - for (int j = 1; j < size2 + 1; j++) { + for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; + for (size_t i = 1; i < size1 + 1; i++) { + for (size_t j = 1; j < size2 + 1; j++) { d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1)); } } return d[size1&1][size2]; } +template +bool edit_distancec_dp(T const *str1, size_t const size1, T const *str2, size_t const size2, unsigned int const thr) { // true iff the edit distance between str1 and str2 is <= thr + vector< vector > d(2, vector(size2 + 1)); + d[0][0] = 0; + for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; + for (size_t i = 1; i < size1 + 1; i++) { + d[i&1][0] = d[(i-1)&1][0] + 1; // column 0: cost of deleting the first i elements of str1 + bool below_thr = false; + for (size_t j = 1; j < size2 + 1; j++) { + d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 
0 : 1)); + if (d[i&1][j] <= thr) { + below_thr = true; + } + } + if (!below_thr) { // every entry of this row exceeds thr, so the final distance does too + return false; + } + } + return d[size1&1][size2] <= thr; +} + template struct varr { uint64_t arr_[N]; @@ -129,3 +150,28 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep); return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる } + +bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) { + if(asize == 0) return bsize <= thr; + if(bsize == 0) return asize <= thr; + // make a the longer sequence + int64_t const *ap, *bp; + unsigned int const *asizep, *bsizep; + if(asize < bsize) ap = b, bp = a, asizep = &bsize, bsizep = &asize; + else ap = a, bp = b, asizep = &asize, bsizep = &bsize; + // work out the required array size + size_t vsize = ((*asizep - 1) >> 6) + 1; // 1 for up to 64 elements, 2 for up to 128, ... + // beyond what the bit-parallel method can handle, make a the shorter sequence instead + if(vsize > 10) { + int64_t const *_ = ap; + unsigned int const *__ = asizep; + ap = bp, bp = _, asizep = bsizep, bsizep = __; + vsize = ((*asizep - 1) >> 6) + 1; + } + + return edit_distancec_dp(ap, *asizep, bp, *bsizep, thr); // hand over to dynamic programming +} + + + + diff --git a/editdistance/_editdistance.h b/editdistance/_editdistance.h index 2671d00..6cf03ff 100644 --- a/editdistance/_editdistance.h +++ b/editdistance/_editdistance.h @@ -8,6 +8,7 @@ extern "C" { #endif unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize); +bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr); #ifdef __cplusplus } diff --git a/editdistance/bycython.pxd b/editdistance/bycython.pxd index 9d0758e..4734ce8 100644 --- a/editdistance/bycython.pxd +++ b/editdistance/bycython.pxd @@ -1,2 +1,3 @@ # cython: language_level=3 cpdef unsigned int eval(object 
a, object b) except 0xffffffffffffffff +cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff \ No newline at end of file diff --git a/editdistance/bycython.pyx b/editdistance/bycython.pyx index d64a67a..d49ed91 100644 --- a/editdistance/bycython.pyx +++ b/editdistance/bycython.pyx @@ -2,11 +2,14 @@ # distutils: sources = editdistance/_editdistance.cpp from libc.stdlib cimport malloc, free +from libcpp cimport bool # from libc.stdint cimport int64_t cdef extern from "./_editdistance.h": ctypedef int int64_t unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize) + bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) + cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff: cdef unsigned int i, dist @@ -20,3 +23,17 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff: free(al) free(bl) return dist + +cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff: + cdef unsigned int i + cdef bint ret + cdef int64_t *al = malloc(len(a) * sizeof(int64_t)) + for i in range(len(a)): + al[i] = hash(a[i]) + cdef int64_t *bl = malloc(len(b) * sizeof(int64_t)) + for i in range(len(b)): + bl[i] = hash(b[i]) + ret = edit_distance_criterion(al, len(a), bl, len(b), thr) + free(al) + free(bl) + return ret diff --git a/setup.py b/setup.py index 2e3471a..78d2444 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ setup( name="editdistance", - version="0.6.2", + version="0.7.0", python_requires=">=3.6", description="Fast implementation of the edit distance(Levenshtein distance)", long_description=readme, diff --git a/test/test_editdistance.py b/test/test_editdistance.py index 57ed1f3..0128feb 100644 --- a/test/test_editdistance.py +++ b/test/test_editdistance.py @@ -4,7 +4,11 @@ class TestEditDistance(unittest.TestCase): def 
test_editdistance(self): import editdistance self.assertEqual(1, editdistance.eval('abc', 'aec')) - + + def test_editdistance_criterion(self): + import editdistance + self.assertEqual(False, editdistance.eval_criterion('abcb', 'aeca', 1)) + self.assertEqual(True, editdistance.eval_criterion('abc', 'aec', 1)) if __name__ == '__main__': unittest.main() \ No newline at end of file