Add a thresholded edit-distance check (eval_criterion / distance_le_than).

eval_criterion(a, b, thr) reports whether the edit distance between a and b
is at most thr. It hashes both sequences like eval() does and runs a two-row
dynamic-programming pass that returns early once no cell of the current row
is within the threshold.

Review fixes folded into this patch:
  * edit_distancec_dp: the early-exit test indexed the row with i%1 (always
    0) instead of i&1. On the first row it therefore inspected the initial
    row d[0][j] = j, so with thr == 0 identical non-empty sequences were
    reported as "not within threshold".
  * edit_distancec_dp: written with an explicit template parameter list and
    an explicit vector element type (unsigned int, matching thr).
  * edit_distance_criterion: dropped the bit-vector sizing and pointer
    swapping that were never used (only the DP routine is called); the
    longer sequence is passed first so the DP rows are sized by the shorter.
  * eval_criterion (Cython): explicit <int64_t *> cast on malloc, docstring.
  * distance_le_than: docstring now names eval_criterion.
  * bycython.pxd / bycython.pyx keep their trailing newline; whitespace-only
    hunks were removed.
  * test: added a thr == 0 regression case.

NOTE(review): indentation and blank context lines were reconstructed from
the hunk line counts; run `git apply --check` (optionally with
`--ignore-whitespace`) against the target tree before applying.

diff --git a/editdistance/__init__.pxd b/editdistance/__init__.pxd
index 1fcf681..2f0683b 100644
--- a/editdistance/__init__.pxd
+++ b/editdistance/__init__.pxd
@@ -1,2 +1,2 @@
 # cython: language_level=3
-from editdistance.bycython cimport eval
+from editdistance.bycython cimport eval, eval_criterion
diff --git a/editdistance/__init__.py b/editdistance/__init__.py
--- a/editdistance/__init__.py
+++ b/editdistance/__init__.py
@@ -1,9 +1,14 @@
-from .bycython import eval
+from .bycython import eval, eval_criterion
 
 
 def distance(*args, **kwargs):
     """"An alias to eval"""
     return eval(*args, **kwargs)
 
 
+def distance_le_than(*args, **kwargs):
+    """An alias to eval_criterion"""
+    return eval_criterion(*args, **kwargs)
+
+
-__all__ = ('eval', 'distance')
+__all__ = ('eval', 'distance', 'eval_criterion', 'distance_le_than')
diff --git a/editdistance/_editdistance.cpp b/editdistance/_editdistance.cpp
--- a/editdistance/_editdistance.cpp
+++ b/editdistance/_editdistance.cpp
@@ -41,7 +41,7 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz
     for(size_t i = 0; i < tlen; ++i) VP[tmax] |= (1LL << i);
     for(size_t i = 0; i < vecsize; ++i) {
         TVALUE &PM = cmap[vec[i]];
-        for(int r = 0; r <= tmax; ++r) {
+        for(unsigned int r = 0; r <= tmax; ++r) {
             uint64_t X = PM[r];
             if(r > 0 && (HN[r - 1] & lmb)) X |= 1LL;
             D0[r] = (((X & VP[r]) + VP[r]) ^ VP[r]) | X | VN[r];
@@ -76,3 +76,29 @@ unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t c
     return d[size1&1][size2];
 }
+
+// Returns true iff the edit distance between str1 and str2 is at most thr.
+// Uses a two-row DP table and returns false early once no cell of the
+// current row is within thr.
+template <typename T>
+bool edit_distancec_dp(T const *str1, size_t const size1, T const *str2, size_t const size2, unsigned int const thr) {
+    vector< vector<unsigned int> > d(2, vector<unsigned int>(size2 + 1));
+    d[0][0] = 0;
+    d[1][0] = 1;
+    for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i;
+    for (size_t i = 1; i < size1 + 1; i++) {
+        d[i&1][0] = d[(i-1)&1][0] + 1;
+        bool below_thr = false;
+        for (size_t j = 1; j < size2 + 1; j++) {
+            d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1));
+            // Test the row just written: i&1 selects it, i%1 is always 0.
+            if (d[i&1][j] <= thr) {
+                below_thr = true;
+            }
+        }
+        if (!below_thr) {
+            return false;
+        }
+    }
+    return d[size1&1][size2] <= thr;
+}
 
@@ -129,3 +155,12 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int
     else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep);
     return edit_distance_dp(ap, *asizep, bp, *bsizep);  // dynamic programmingに任せる
 }
+
+// Returns true iff the edit distance between a and b is at most thr.
+bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) {
+    if(asize == 0) return bsize <= thr;
+    if(bsize == 0) return asize <= thr;
+    // Pass the longer sequence first so the DP rows are sized by the shorter one.
+    if(asize < bsize) return edit_distancec_dp(b, bsize, a, asize, thr);
+    return edit_distancec_dp(a, asize, b, bsize, thr);
+}
diff --git a/editdistance/_editdistance.h b/editdistance/_editdistance.h
index ac5f71d..d590b7e 100644
--- a/editdistance/_editdistance.h
+++ b/editdistance/_editdistance.h
@@ -8,6 +8,7 @@ extern "C" {
 #endif
 
 unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize);
+bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr);
 unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2);
 
 #ifdef __cplusplus
diff --git a/editdistance/bycython.pxd b/editdistance/bycython.pxd
--- a/editdistance/bycython.pxd
+++ b/editdistance/bycython.pxd
@@ -1,2 +1,3 @@
 # cython: language_level=3
 cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff
+cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff
diff --git a/editdistance/bycython.pyx b/editdistance/bycython.pyx
--- a/editdistance/bycython.pyx
+++ b/editdistance/bycython.pyx
@@ -2,11 +2,13 @@
 # distutils: sources = editdistance/_editdistance.cpp
 
 from libc.stdlib cimport malloc, free
+from libcpp cimport bool
 # from libc.stdint cimport int64_t
 
 cdef extern from "./_editdistance.h":
     ctypedef int int64_t
     unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize)
+    bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr)
     unsigned int edit_distance_dp(const int64_t *str1, const size_t size1, const int64_t *str2, const size_t size2)
 
 
@@ -22,7 +24,23 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
     free(al)
     free(bl)
     return dist
 
 
+cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff:
+    """Return True if the edit distance between a and b is at most thr."""
+    cdef unsigned int i
+    cdef bint ret
+    cdef int64_t *al = <int64_t *>malloc(len(a) * sizeof(int64_t))
+    for i in range(len(a)):
+        al[i] = hash(a[i])
+    cdef int64_t *bl = <int64_t *>malloc(len(b) * sizeof(int64_t))
+    for i in range(len(b)):
+        bl[i] = hash(b[i])
+    ret = edit_distance_criterion(al, len(a), bl, len(b), thr)
+    free(al)
+    free(bl)
+    return ret
+
+
 cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff:
     cdef unsigned int i, dist
diff --git a/setup.py b/setup.py
index 2e3471a..78d2444 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
 
 setup(
     name="editdistance",
-    version="0.6.2",
+    version="0.7.0",
     python_requires=">=3.6",
     description="Fast implementation of the edit distance(Levenshtein distance)",
     long_description=readme,
diff --git a/test/test_editdistance.py b/test/test_editdistance.py
--- a/test/test_editdistance.py
+++ b/test/test_editdistance.py
@@ -7,6 +7,13 @@ def test_editdistance(self):
         import editdistance
         self.assertEqual(1, editdistance.eval('abc', 'aec'))
 
+    def test_editdistance_criterion(self):
+        import editdistance
+        self.assertFalse(editdistance.eval_criterion('abcb', 'aeca', 1))
+        self.assertTrue(editdistance.eval_criterion('abc', 'aec', 1))
+        # Regression: identical inputs must satisfy a zero threshold.
+        self.assertTrue(editdistance.eval_criterion('abc', 'abc', 0))
+
     def test_dp_editdistance(self):
         from editdistance.bycython import eval_dp
         self.assertEqual(3, eval_dp('bbb', 'a'))