Skip to content

Commit

Permalink
patching cve
Browse files Browse the repository at this point in the history
  • Loading branch information
alvations committed Sep 25, 2021
1 parent 61b6ed5 commit 8ad457e
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 1 deletion.
72 changes: 72 additions & 0 deletions sacremoses/test/test_no_redos_has_numeric_only.py
@@ -0,0 +1,72 @@
import re


import unittest
from collections import defaultdict

from sacremoses.corpus import NonbreakingPrefixes
from sacremoses.tokenize import MosesTokenizer



class HasNumericOnlyPatched(unittest.TestCase):
    """Regression test for ``MosesTokenizer.has_numeric_only``.

    Checks that the NUMERIC_ONLY_PREFIXES parsing yields the same prefixes per
    language without the redos-able regex.
    """

    # The expected mapping is large; show the complete diff when it mismatches.
    maxDiff = None

    def test_expected_num_only_prefixes(self):
        """Compare the detected numeric-only prefixes against a fixed snapshot."""
        # Snapshot: language key -> list of (prefix, full nonbreaking-prefix entry).
        expected_prefixes = {
            'as': [], 'bn': [], 'ca': [], 'cs': [], 'de': [], 'el': [],
            'en': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                   ('pp', 'pp #NUMERIC_ONLY#')],
            'es': [], 'et': [], 'fi': [], 'fr': [],
            'ga': [('lch', 'lch #NUMERIC_ONLY#'), ('lgh', 'lgh #NUMERIC_ONLY#'),
                   ('uimh', 'uimh #NUMERIC_ONLY#')],
            'gu': [], 'hi': [],
            'hu': [('jan', 'jan #NUMERIC_ONLY#'), ('Jan', 'Jan #NUMERIC_ONLY#'),
                   ('Feb', 'Feb #NUMERIC_ONLY#'), ('feb', 'feb #NUMERIC_ONLY#'),
                   ('márc', 'márc #NUMERIC_ONLY#'), ('Márc', 'Márc #NUMERIC_ONLY#'),
                   ('ápr', 'ápr #NUMERIC_ONLY#'), ('Ápr', 'Ápr #NUMERIC_ONLY#'),
                   ('máj', 'máj #NUMERIC_ONLY#'), ('Máj', 'Máj #NUMERIC_ONLY#'),
                   ('jún', 'jún #NUMERIC_ONLY#'), ('Jún', 'Jún #NUMERIC_ONLY#'),
                   ('Júl', 'Júl #NUMERIC_ONLY#'), ('júl', 'júl #NUMERIC_ONLY#'),
                   ('aug', 'aug #NUMERIC_ONLY#'), ('Aug', 'Aug #NUMERIC_ONLY#'),
                   ('Szept', 'Szept #NUMERIC_ONLY#'), ('szept', 'szept #NUMERIC_ONLY#'),
                   ('okt', 'okt #NUMERIC_ONLY#'), ('Okt', 'Okt #NUMERIC_ONLY#'),
                   ('nov', 'nov #NUMERIC_ONLY#'), ('Nov', 'Nov #NUMERIC_ONLY#'),
                   ('dec', 'dec #NUMERIC_ONLY#'), ('Dec', 'Dec #NUMERIC_ONLY#'),
                   ('tel', 'tel #NUMERIC_ONLY#'), ('Tel', 'Tel #NUMERIC_ONLY#'),
                   ('Fax', 'Fax #NUMERIC_ONLY#'), ('fax', 'fax #NUMERIC_ONLY#')],
            'is': [('no', 'no #NUMERIC_ONLY#'), ('No', 'No #NUMERIC_ONLY#'),
                   ('nr', 'nr #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#'),
                   ('nR', 'nR #NUMERIC_ONLY#'), ('NR', 'NR #NUMERIC_ONLY#')],
            'it': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                   ('pp', 'pp #NUMERIC_ONLY#')],
            'kn': [],
            'lt': [('No', 'No #NUMERIC_ONLY#')],
            'lv': [('Nr', 'Nr #NUMERIC_ONLY#')],
            'ml': [], 'mni': [], 'mr': [],
            'nl': [('Nr', 'Nr #NUMERIC_ONLY#'), ('nr', 'nr #NUMERIC_ONLY#')],
            'or': [], 'pa': [],
            'pl': [('nr', 'nr #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#'),
                   ('pkt', 'pkt #NUMERIC_ONLY#'), ('str', 'str #NUMERIC_ONLY#'),
                   ('tab', 'tab #NUMERIC_ONLY#'), ('Tab', 'Tab #NUMERIC_ONLY#'),
                   ('ust', 'ust #NUMERIC_ONLY#'), ('par', 'par #NUMERIC_ONLY#'),
                   ('r', 'r #NUMERIC_ONLY#'), ('l', 'l #NUMERIC_ONLY#'),
                   ('s', 's #NUMERIC_ONLY#')],
            'pt': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                   ('p', 'p #NUMERIC_ONLY#'), ('pp', 'pp #NUMERIC_ONLY#')],
            'ro': [], 'ru': [], 'sk': [],
            'sl': [('št', 'št #NUMERIC_ONLY#'), ('Št', 'Št #NUMERIC_ONLY#')],
            'sv': [], 'ta': [], 'te': [],
            'tdt': [('No', 'No #NUMERIC_ONLY#'), ('Art', 'Art #NUMERIC_ONLY#'),
                    ('p', 'p #NUMERIC_ONLY#'), ('pp', 'pp #NUMERIC_ONLY#')],
            'yue': [('No', 'No #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#')],
            'zh': [('No', 'No #NUMERIC_ONLY#'), ('Nr', 'Nr #NUMERIC_ONLY#')],
        }

        nonbreaking_prefixes = NonbreakingPrefixes()
        moses = MosesTokenizer()

        # For every available language, keep only the entries flagged by
        # has_numeric_only and pair each with the text before its last space.
        lang2numonlyprefix = {
            lang: [
                (w.rpartition(" ")[0], w)
                for w in nonbreaking_prefixes.words(lang)
                if moses.has_numeric_only(w)
            ]
            for lang in nonbreaking_prefixes.available_langs.values()
        }

        # assertEqual (unlike a bare assert) survives `python -O` and reports
        # exactly which languages/prefixes differ.
        self.assertEqual(lang2numonlyprefix, expected_prefixes)
2 changes: 1 addition & 1 deletion sacremoses/tokenize.py
Expand Up @@ -363,7 +363,7 @@ def isanyalpha(self, text):
return any(set(text).intersection(set(self.IsAlpha)))

def has_numeric_only(self, text):
    """Return True if ``text`` contains whitespace followed by ``#NUMERIC_ONLY#``.

    :param text: The string to inspect (e.g. a nonbreaking-prefix entry).
    :type text: str
    :return: Whether one or more whitespace characters are directly followed
        by the literal ``#NUMERIC_ONLY#`` marker somewhere in ``text``.
    :rtype: bool
    """
    # No leading "(.*)": re.search already tries every start position, so the
    # greedy prefix never changed the result and only added the heavy
    # backtracking (ReDoS) that this patch removes.
    return bool(re.search(r"[\s]+(\#NUMERIC_ONLY\#)", text))

def handles_nonbreaking_prefixes(self, text):
# Splits the text into tokens to check for nonbreaking prefixes.
Expand Down

0 comments on commit 8ad457e

Please sign in to comment.