From a33f4644b2256c89d38f895689463b8a4d5abb41 Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Mon, 24 Apr 2023 10:54:45 +0200 Subject: [PATCH 1/2] expose edit_distance_dp and add tests - Removed template code: - Only one value for the template is used (T=int64_t) - In cython it is easier to use non-template code - Fix warning: comparison of integer expressions of different signedness - --- editdistance/_editdistance.cpp | 11 +++++------ editdistance/_editdistance.h | 1 + editdistance/bycython.pyx | 16 ++++++++++++++++ test/test_editdistance.py | 16 ++++++++++++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp index 220ddb7..dbb16a8 100644 --- a/editdistance/_editdistance.cpp +++ b/editdistance/_editdistance.cpp @@ -61,15 +61,14 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz /// c.f. http://handasse.blogspot.com/2009/04/c_29.html -template -unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, size_t const size2) { +unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2) { // vectorより固定長配列の方が速いが、文字列が長い時の保険でのみ呼ばれるのでサイズを決め打ちできない vector< vector > d(2, vector(size2 + 1)); d[0][0] = 0; d[1][0] = 1; - for (int i = 0; i < size2 + 1; i++) d[0][i] = i; - for (int i = 1; i < size1 + 1; i++) { - for (int j = 1; j < size2 + 1; j++) { + for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; + for (size_t i = 1; i < size1 + 1; i++) { + for (size_t j = 1; j < size2 + 1; j++) { d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1)); } } @@ -127,5 +126,5 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int else if(vsize == 8) return edit_distance_map_<8>(ap, *asizep, bp, *bsizep); else if(vsize == 9) return edit_distance_map_<9>(ap, *asizep, bp, *bsizep); else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep); - return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる + return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる } diff --git a/editdistance/_editdistance.h b/editdistance/_editdistance.h index 2671d00..ac5f71d 100644 --- a/editdistance/_editdistance.h +++ b/editdistance/_editdistance.h @@ -8,6 +8,7 @@ extern "C" { #endif unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize); +unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2); #ifdef __cplusplus } diff --git a/editdistance/bycython.pyx b/editdistance/bycython.pyx index d64a67a..258a03a 100644 --- a/editdistance/bycython.pyx +++ b/editdistance/bycython.pyx @@ -7,6 +7,8 @@ from libc.stdlib cimport malloc, free cdef extern from "./_editdistance.h": ctypedef int int64_t unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize) + unsigned int edit_distance_dp(const int64_t *str1, const size_t size1, const int64_t *str2, const size_t size2) + cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff: cdef unsigned int i, dist @@ -20,3 +22,17 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff: free(al) free(bl) return dist + + +cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff: + cdef unsigned int i, dist + cdef int64_t *al = malloc(len(a) * sizeof(int64_t)) + for i in range(len(a)): + al[i] = hash(a[i]) + cdef int64_t *bl = malloc(len(b) * sizeof(int64_t)) + for i in range(len(b)): + bl[i] = hash(b[i]) + dist = edit_distance_dp(al, len(a), bl, len(b)) + free(al) + free(bl) + return dist diff --git a/test/test_editdistance.py b/test/test_editdistance.py index 57ed1f3..2dc6242 100644 --- a/test/test_editdistance.py +++ b/test/test_editdistance.py @@ -1,9 +1,25 @@ import unittest +import random + class TestEditDistance(unittest.TestCase): def test_editdistance(self): import editdistance self.assertEqual(1, editdistance.eval('abc', 'aec')) + + def test_dp_editdistance(self): + from editdistance.bycython import eval_dp + self.assertEqual(3, eval_dp('bbb', 'a')) + self.assertEqual(3, eval_dp('a', 'bbb')) + + def test_dp_vs_default(self): + for _ in range(10): + import editdistance + from editdistance.bycython import eval_dp + seq1 = random.choices([0, 1, 2], k=random.randint(10, 50)) + seq2 = random.choices([0, 1, 2], k=random.randint(10, 50)) + + self.assertEqual(editdistance.eval(seq1, seq2), eval_dp(seq1, seq2)) if __name__ == '__main__': From 6f3fbe4fb63c81b52acff438d8de5340b37e0c2e Mon Sep 17 00:00:00 2001 From: Christoph Boeddeker Date: Mon, 24 Apr 2023 10:54:53 +0200 Subject: [PATCH 2/2] fix edit_distance_dp --- editdistance/_editdistance.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp index dbb16a8..9769466 100644 --- a/editdistance/_editdistance.cpp +++ b/editdistance/_editdistance.cpp @@ -68,6 +68,7 @@ unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t c d[1][0] = 1; for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i; for (size_t i = 1; i < size1 + 1; i++) { + d[i&1][0] = d[(i-1)&1][0] + 1; for (size_t j = 1; j < size2 + 1; j++) { d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1)); }