Skip to content

Commit

Permalink
Merge pull request #127 from INGEOTEC/develop
Browse files Browse the repository at this point in the history
Version - 2.0.4
  • Loading branch information
mgraffg committed Jan 12, 2024
2 parents 6984039 + 47931d9 commit 1054a58
Show file tree
Hide file tree
Showing 4 changed files with 108 additions and 3 deletions.
2 changes: 1 addition & 1 deletion EvoMSA/__init__.py
Expand Up @@ -11,7 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = '2.0.3'
__version__ = '2.0.4'

try:
from EvoMSA.text_repr import BoW, TextRepresentations, StackGeneralization, DenseBoW
Expand Down
23 changes: 22 additions & 1 deletion EvoMSA/tests/test_text_repr.py
Expand Up @@ -612,4 +612,25 @@ def test_DenseBoW_tailored():
n_jobs=-1)
assert len(dense.names) == 0
dense.text_representations_extend('IberLEF2023_DAVINCIS_task1')
assert len(dense.names)
assert len(dense.names)


def test_unique():
    """Exercise ``unique`` on a small corpus with repeats and on the tweets fixture."""
    from EvoMSA.text_repr import unique

    corpus = ['hola buenos dias', 'peticion', 'la vida', 'peticion',
              'adios colegas', 'la vida', 'comiendo en el salón', 'adios colegas',
              'nuevo', 'la vida', 'nuevo', 'hola buenos dias']
    # Positions of the first occurrence of every distinct text in `corpus`.
    expected = np.array([0, 1, 2, 4, 6, 8])
    # A batch smaller than the corpus forces several batch iterations.
    options = dict(lang='es', batch_size=4)

    kept = unique(corpus, return_index=True, **options)
    assert np.all(kept == expected)

    # Texts matching an element of G must be discarded, so index 0 is gone.
    kept = unique(corpus, G=['hola buenos dias'], return_index=True, **options)
    assert kept[0] != 0

    # Without return_index the surviving texts themselves come back as a list.
    texts = unique(corpus, return_index=False, **options)
    assert isinstance(texts, list)
    assert len(texts) == expected.shape[0]

    # Make the last tweet a copy of the first one; it has to be removed,
    # leaving 998 as the last surviving index.
    tweets = list(tweet_iterator(TWEETS))
    tweets[-1] = tweets[0]
    kept = unique(tweets, lang='es', batch_size=11, return_index=True)
    assert kept[-1] == 998
83 changes: 82 additions & 1 deletion EvoMSA/text_repr.py
Expand Up @@ -1058,4 +1058,85 @@ def __sklearn_clone__(self):
for x in self.decision_function_models]
params['transform_models'] = [clone(x)
for x in self.transform_models]
return klass(**params)
return klass(**params)


def unique(D: List[Union[dict, list]],
           G: List[Union[dict, list]]=None,
           lang: str='es',
           return_index: bool=False,
           alpha: float=0.95,
           batch_size: int=1024,
           bow_params: dict=None):
    """Compute the unique elements in a set using :py:class:`~EvoMSA.text_repr.BoW`

    Every text is transformed with :py:class:`~EvoMSA.text_repr.BoW`; two
    texts are considered duplicates when the dot product of their vectors
    is greater than or equal to `alpha`. The corpus is processed in batches
    of `batch_size` elements: duplicates are first removed inside the batch,
    and then every remaining text similar to a kept element of the batch is
    discarded. From each group of duplicates, the element with the lowest
    position in `D` is kept.

    :param D: Texts; when it is a list of dictionaries, the text is on the key :py:attr:`BoW.key`
    :type D: List of texts or dictionaries.
    :param G: Reference texts; the elements of `D` that are similar to any element of `G` are removed. :py:data:`None` or an empty list disables this filter.
    :type G: List of texts or dictionaries.
    :param lang: Language.
    :type lang: str
    :param return_index: Return the indexes (positions in `D`) instead of the elements.
    :type return_index: bool
    :param alpha: Minimum similarity (dot product) to consider two texts equal.
    :type alpha: float
    :param batch_size: Batch size
    :type batch_size: int
    :param bow_params: :py:class:`~EvoMSA.text_repr.BoW` params; `lang` takes precedence over a `lang` key given here. The dictionary is not modified.
    :type bow_params: dict
    :return: The unique elements of `D`, or their indexes when `return_index` is True.

    >>> from EvoMSA.text_repr import unique
    >>> D = ['hi', 'bye', 'HI.']
    >>> unique(D, lang='en')
    ['hi', 'bye']
    """
    from tqdm import tqdm

    def unique_self(elementos):
        """Remove the duplicates found inside the batch `elementos`."""
        batch = X[elementos]
        # Sparse boolean matrix: (i, j) is stored iff rows i and j are similar.
        # NOTE(review): the selection below relies on the column indices of
        # each row being sorted; the scalar comparison appears to return the
        # matrix in canonical form - confirm against the SciPy version in use.
        sim = (batch @ batch.T) >= alpha
        indptr = sim.indptr
        indices = sim.indices
        # Rows with more than one stored entry have at least one duplicate.
        index = np.where(np.diff(indptr) > 1)[0]
        # Keep as representatives the rows whose first stored column is the
        # row itself, i.e., the first element of each group of duplicates.
        index = index[index == indices[indptr[index]]]
        keep = np.ones(elementos.shape[0], dtype=bool)
        for i in index:
            # Everything after the representative in its row is a duplicate.
            keep[indices[indptr[i] + 1:indptr[i + 1]]] = False
        return elementos[keep]

    def unique_rest(frst, rest):
        """Remove from `rest` the elements similar to any element of `frst`."""
        sim = (X[rest] @ X[frst].T).max(axis=1).toarray()
        return rest[sim.flatten() < alpha]

    # Work on a copy so the caller's dictionary is not modified.
    params = dict() if bow_params is None else dict(bow_params)
    params.update(lang=lang)
    transform = BoW(**params).transform
    # assumes transform returns a SciPy CSR sparse matrix (indptr/indices are
    # read above) - TODO confirm
    X = transform(D)
    init = 0
    pool = np.arange(len(D))
    with tqdm(total=pool.shape[0]) as _tqdm:
        while init < pool.shape[0]:
            # pool = [already processed | current batch | pending]
            past = pool[:init]
            elementos = pool[init:init + batch_size]
            rest = pool[init + batch_size:]
            frst = unique_self(elementos)
            ele_size = elementos.shape[0]
            r_size = rest.shape[0]
            rest = unique_rest(frst, rest)
            # Progress: the batch just processed plus the pending elements
            # that were discarded.
            _tqdm.update(ele_size + r_size - rest.shape[0])
            init = init + frst.shape[0]
            pool = np.r_[past, frst, rest]
    # An empty G has nothing to compare against; skipping it also avoids
    # reducing over a zero-length axis, which raises ValueError.
    if G is not None and len(G):
        reference = transform(G)
        sim = (X[pool] @ reference.T).max(axis=1).toarray()
        pool = pool[sim.flatten() < alpha]
    if return_index:
        return pool
    return [D[x] for x in pool]
3 changes: 3 additions & 0 deletions environment.yml
Expand Up @@ -6,3 +6,6 @@ dependencies:
- evodag
- b4msa
- cython
- jax
- optax
- IngeoML

0 comments on commit 1054a58

Please sign in to comment.