Skip to content

Commit

Permalink
Merge branch 'master' into use-pdm
Browse files Browse the repository at this point in the history
  • Loading branch information
roy-ht committed Feb 10, 2024
2 parents b50a564 + 8b61734 commit 9aa9947
Show file tree
Hide file tree
Showing 8 changed files with 175 additions and 11 deletions.
59 changes: 59 additions & 0 deletions setup.py
@@ -0,0 +1,59 @@
"""
-------
License
-------
It is released under the MIT license.
Copyright (c) 2013 Hiroyuki Tanaka
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
"""


# Build script for the ``editdistance`` package: compiles the Cython
# extension and registers the package metadata.

# Prefer setuptools; fall back to the legacy distutils API only when
# setuptools is genuinely not importable.
try:
    from setuptools import Extension, setup
except ImportError:
    # ``Extension`` and ``setup`` are exposed by ``distutils.core``; the
    # top-level ``distutils`` package does not export them.
    from distutils.core import Extension, setup

from Cython.Build import cythonize

# Translate the Cython source into extension module objects for setup().
ext_modules = cythonize("editdistance/bycython.pyx")

# Read the long description with an explicit encoding so the result does not
# depend on the locale of the machine running the build.
with open("README.rst", encoding="utf-8") as readme_file:
    readme = readme_file.read()

setup(
    name="editdistance",
    version="0.7.0",
    python_requires=">=3.6",
    description="Fast implementation of the edit distance(Levenshtein distance)",
    long_description=readme,
    long_description_content_type="text/x-rst",
    author="Hiroyuki Tanaka",
    author_email="aflc0x@gmail.com",
    url="https://www.github.com/roy-ht/editdistance",
    ext_modules=ext_modules,
    packages=["editdistance"],
    # Ship the Cython/C headers alongside the package.
    package_data={
        "editdistance": ["__init__.pxd", "_editdistance.h", "bycython.pxd", "def.h"]
    },
    classifiers=[
        "License :: OSI Approved :: MIT License",
        "Programming Language :: Python",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3 :: Only",
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: Implementation :: CPython",
        "Programming Language :: Python :: Implementation :: PyPy",
    ],
)
2 changes: 1 addition & 1 deletion src/editdistance/__init__.pxd
@@ -1,2 +1,2 @@
# cython: language_level=3
from editdistance.bycython cimport eval
from editdistance.bycython cimport eval, eval_criterion
7 changes: 5 additions & 2 deletions src/editdistance/__init__.py
@@ -1,9 +1,12 @@
from .bycython import eval
from .bycython import eval, eval_criterion


def distance(*args, **kwargs):
    """Alias of ``eval``.

    All positional and keyword arguments are forwarded unchanged and the
    value returned by ``eval`` is returned as-is.
    """
    return eval(*args, **kwargs)

def distance_le_than(*args, **kwargs):
    """Alias of ``eval_criterion``.

    All positional and keyword arguments are forwarded unchanged and the
    value returned by ``eval_criterion`` is returned as-is.
    """
    return eval_criterion(*args, **kwargs)

__all__ = ('eval', 'distance')
__all__ = ('eval', 'distance', "eval_criterion", "distance_le_than")
61 changes: 54 additions & 7 deletions src/editdistance/_editdistance.cpp
Expand Up @@ -41,7 +41,7 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz
for(size_t i = 0; i < tlen; ++i) VP[tmax] |= (1LL << i);
for(size_t i = 0; i < vecsize; ++i) {
TVALUE &PM = cmap[vec[i]];
for(int r = 0; r <= tmax; ++r) {
for(unsigned int r = 0; r <= tmax; ++r) {
uint64_t X = PM[r];
if(r > 0 && (HN[r - 1] & lmb)) X |= 1LL;
D0[r] = (((X & VP[r]) + VP[r]) ^ VP[r]) | X | VN[r];
Expand All @@ -61,21 +61,43 @@ unsigned int edit_distance_bpv(T &cmap, int64_t const *vec, size_t const &vecsiz


/// c.f. http://handasse.blogspot.com/2009/04/c_29.html
/// Edit distance between str1[0..size1) and str2[0..size2) computed by dynamic
/// programming over two rolling rows of size2 + 1 cells each.
/// Returns the value of the last cell, d[size1 & 1][size2].
// NOTE(review): this span appears to be a rendered diff, so both the templated
// and the int64_t signature, and both the int- and size_t-indexed loop headers,
// are shown together — confirm which lines belong to the current file.
template<typename T>
unsigned int edit_distance_dp(T const *str1, size_t const size1, T const *str2, size_t const size2) {
unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2) {
// A fixed-length array would be faster than vector, but this is only called as a fallback for long inputs, so the size cannot be hard-coded.
vector< vector<uint32_t> > d(2, vector<uint32_t>(size2 + 1));
d[0][0] = 0;
d[1][0] = 1;
for (int i = 0; i < size2 + 1; i++) d[0][i] = i;
for (int i = 1; i < size1 + 1; i++) {
for (int j = 1; j < size2 + 1; j++) {
// Row 0 starts as 0..size2.
for (size_t i = 0; i < size2 + 1; i++) d[0][i] = i;
for (size_t i = 1; i < size1 + 1; i++) {
// d[i&1] is the row being filled, d[(i-1)&1] the row filled in the previous iteration.
d[i&1][0] = d[(i-1)&1][0] + 1;
for (size_t j = 1; j < size2 + 1; j++) {
// Cheapest of: cell above + 1, cell to the left + 1, diagonal cell + (0 if the elements match, else 1).
d[i&1][j] = min(min(d[(i-1)&1][j], d[i&1][j-1]) + 1, d[(i-1)&1][j-1] + (str1[i-1] == str2[j-1] ? 0 : 1));
}
}
return d[size1&1][size2];
}

/// Decides whether the edit (Levenshtein) distance between two sequences is at
/// most `thr`, using a two-row dynamic-programming table with early exit.
///
/// @param str1  first sequence (size1 elements, compared with operator==)
/// @param size1 number of elements in str1
/// @param str2  second sequence (size2 elements)
/// @param size2 number of elements in str2
/// @param thr   inclusive distance threshold
/// @return true when the distance is <= thr, false otherwise
template<typename T>
bool edit_distancec_dp(T const *str1, size_t const size1, T const *str2, size_t const size2, unsigned int const thr) {
    // Rolling rows: d[i & 1] holds the costs for the first i elements of str1,
    // d[(i - 1) & 1] the costs for the first i - 1 elements.
    std::vector< std::vector<uint32_t> > d(2, std::vector<uint32_t>(size2 + 1));
    for (size_t j = 0; j < size2 + 1; j++) d[0][j] = static_cast<uint32_t>(j);
    for (size_t i = 1; i < size1 + 1; i++) {
        std::vector<uint32_t> &cur = d[i & 1];
        std::vector<uint32_t> const &prev = d[(i - 1) & 1];
        cur[0] = prev[0] + 1;
        // Column 0 belongs to the row as well; including it keeps the early
        // exit correct when size2 == 0.
        bool below_thr = cur[0] <= thr;
        for (size_t j = 1; j < size2 + 1; j++) {
            uint32_t const cost = (str1[i - 1] == str2[j - 1]) ? 0 : 1;
            cur[j] = std::min(std::min(prev[j], cur[j - 1]) + 1, prev[j - 1] + cost);
            // Must inspect the row just written (i & 1); `i % 1` is always 0
            // and would look at the wrong row on every odd i.
            if (cur[j] <= thr) below_thr = true;
        }
        // Every cell of a later row is at least the minimum of this row, so
        // once a whole row exceeds thr the final distance must exceed it too.
        if (!below_thr) return false;
    }
    return d[size1 & 1][size2] <= thr;
}

template <size_t N>
struct varr {
uint64_t arr_[N];
Expand Down Expand Up @@ -127,5 +149,30 @@ unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int
else if(vsize == 8) return edit_distance_map_<8>(ap, *asizep, bp, *bsizep);
else if(vsize == 9) return edit_distance_map_<9>(ap, *asizep, bp, *bsizep);
else if(vsize == 10) return edit_distance_map_<10>(ap, *asizep, bp, *bsizep);
return edit_distance_dp<int64_t>(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる
return edit_distance_dp(ap, *asizep, bp, *bsizep); // dynamic programmingに任せる
}

/// Reports whether the edit distance between `a` and `b` is at most `thr`.
///
/// @param a     first sequence of element hashes
/// @param asize number of elements in a
/// @param b     second sequence of element hashes
/// @param bsize number of elements in b
/// @param thr   inclusive distance threshold
/// @return true when the distance is <= thr, false otherwise
bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr) {
    // Against an empty sequence the distance is the length of the other one.
    if(asize == 0) return bsize <= thr;
    if(bsize == 0) return asize <= thr;
    // Order the inputs so that `longer` has at least as many elements.
    int64_t const *longer = a, *shorter = b;
    unsigned int longer_size = asize, shorter_size = bsize;
    if(asize < bsize) {
        longer = b, shorter = a;
        longer_size = bsize, shorter_size = asize;
    }
    // The distance can never be smaller than the difference in length, so a
    // gap larger than thr settles the answer without running the DP.
    if(longer_size - shorter_size > thr) return false;
    // Only the DP implementation is used here, so no bit-vector sizing is
    // needed; passing the shorter sequence second keeps the DP rows
    // (size2 + 1 cells each) as small as possible.
    return edit_distancec_dp<int64_t>(longer, longer_size, shorter, shorter_size, thr);
}




2 changes: 2 additions & 0 deletions src/editdistance/_editdistance.h
Expand Up @@ -8,6 +8,8 @@ extern "C" {
#endif

unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize);
bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr);
unsigned int edit_distance_dp(int64_t const *str1, size_t const size1, int64_t const *str2, size_t const size2);

#ifdef __cplusplus
}
Expand Down
1 change: 1 addition & 0 deletions src/editdistance/bycython.pxd
@@ -1,2 +1,3 @@
# cython: language_level=3
cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff
cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff
31 changes: 31 additions & 0 deletions src/editdistance/bycython.pyx
Expand Up @@ -2,11 +2,15 @@
# distutils: sources = src/editdistance/_editdistance.cpp

from libc.stdlib cimport malloc, free
from libcpp cimport bool
# from libc.stdint cimport int64_t

cdef extern from "./_editdistance.h":
ctypedef int int64_t
unsigned int edit_distance(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize)
bool edit_distance_criterion(const int64_t *a, const unsigned int asize, const int64_t *b, const unsigned int bsize, const unsigned int thr)
unsigned int edit_distance_dp(const int64_t *str1, const size_t size1, const int64_t *str2, const size_t size2)


cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
cdef unsigned int i, dist
Expand All @@ -20,3 +24,30 @@ cpdef unsigned int eval(object a, object b) except 0xffffffffffffffff:
free(al)
free(bl)
return dist

cpdef bint eval_criterion(object a, object b, const unsigned int thr) except 0xffffffffffffffff:
    """Return whether the edit distance between ``a`` and ``b`` is at most ``thr``.

    Every element of both sequences is reduced to its ``hash()``; the two
    resulting integer arrays are passed to the C++ ``edit_distance_criterion``.

    Raises ``MemoryError`` when a working buffer cannot be allocated, and
    propagates whatever ``len()``, indexing or ``hash()`` raise for the inputs.
    """
    cdef size_t i
    cdef bint ret
    cdef size_t alen = len(a)
    cdef size_t blen = len(b)
    cdef int64_t *al = <int64_t *>malloc(alen * sizeof(int64_t))
    cdef int64_t *bl = <int64_t *>malloc(blen * sizeof(int64_t))
    try:
        # malloc(0) is allowed to return NULL, so only a failed non-empty
        # allocation counts as an error.
        if (al == NULL and alen > 0) or (bl == NULL and blen > 0):
            raise MemoryError()
        for i in range(alen):
            al[i] = hash(a[i])
        for i in range(blen):
            bl[i] = hash(b[i])
        ret = edit_distance_criterion(al, <unsigned int>alen, bl, <unsigned int>blen, thr)
    finally:
        # Also runs when hash() or indexing raised, so the buffers are never
        # leaked; free(NULL) is a no-op.
        free(al)
        free(bl)
    return ret

cpdef unsigned int eval_dp(object a, object b) except 0xffffffffffffffff:
    """Return the edit distance between ``a`` and ``b`` via ``edit_distance_dp``.

    Every element of both sequences is reduced to its ``hash()``; the two
    resulting integer arrays are passed to the C++ dynamic-programming routine.

    Raises ``MemoryError`` when a working buffer cannot be allocated, and
    propagates whatever ``len()``, indexing or ``hash()`` raise for the inputs.
    """
    cdef size_t i
    cdef unsigned int dist
    cdef size_t alen = len(a)
    cdef size_t blen = len(b)
    cdef int64_t *al = <int64_t *>malloc(alen * sizeof(int64_t))
    cdef int64_t *bl = <int64_t *>malloc(blen * sizeof(int64_t))
    try:
        # malloc(0) is allowed to return NULL, so only a failed non-empty
        # allocation counts as an error.
        if (al == NULL and alen > 0) or (bl == NULL and blen > 0):
            raise MemoryError()
        for i in range(alen):
            al[i] = hash(a[i])
        for i in range(blen):
            bl[i] = hash(b[i])
        dist = edit_distance_dp(al, alen, bl, blen)
    finally:
        # Also runs when hash() or indexing raised, so the buffers are never
        # leaked; free(NULL) is a no-op.
        free(al)
        free(bl)
    return dist
23 changes: 22 additions & 1 deletion test/test_editdistance.py
@@ -1,10 +1,31 @@
import unittest
import random


class TestEditDistance(unittest.TestCase):
    """Checks ``eval``, ``eval_criterion`` and the DP-only ``eval_dp`` entry point."""

    def test_editdistance(self):
        import editdistance
        self.assertEqual(1, editdistance.eval('abc', 'aec'))

    def test_editdistance_criterion(self):
        import editdistance
        self.assertFalse(editdistance.eval_criterion('abcb', 'aeca', 1))
        self.assertTrue(editdistance.eval_criterion('abc', 'aec', 1))

    def test_dp_editdistance(self):
        from editdistance.bycython import eval_dp
        # Same pair in both argument orders.
        self.assertEqual(3, eval_dp('bbb', 'a'))
        self.assertEqual(3, eval_dp('a', 'bbb'))

    def test_dp_vs_default(self):
        import editdistance
        from editdistance.bycython import eval_dp
        for _ in range(10):
            seq1 = random.choices([0, 1, 2], k=random.randint(10, 50))
            seq2 = random.choices([0, 1, 2], k=random.randint(10, 50))
            # The inputs are random, so attach them to the sub-test: a failure
            # then reports the exact sequences needed to reproduce it.
            with self.subTest(seq1=seq1, seq2=seq2):
                self.assertEqual(editdistance.eval(seq1, seq2), eval_dp(seq1, seq2))


if __name__ == '__main__':
unittest.main()

0 comments on commit 9aa9947

Please sign in to comment.