Merge pull request #64 from zaibacu/allow-store-patterns-in-variables

Allow store patterns in variables
zaibacu · Jun 18, 2020 · 215a119 · 215a119
2 parents c6c1600 + 82f7dc8
commit 215a119
Show file tree

Hide file tree

Showing 22 changed files with 148 additions and 54 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,42 @@
+0.5.0 (2020-06-18)
+****************************
+
+Features
+--------
+
+- Added `PREFIX` macro which allows attaching a word in front of list items or words
+  #47
+- Allow passing variables directly when calling `compile` and `compile_string`
+  #51
+- Allow compiling (and later loading) rules using the rita CLI while using the standalone engine (spaCy is already supported)
+  #53
+- Added the ability to import rule files into a rule file. Recursive import is supported as well.
+  #55
+- Added the possibility to define a pattern as a variable and reuse it in other patterns:
+
+  Example:
+  .. code-block:: RITA
+
+      ComplexNumber = {NUM+, WORD("/")?, NUM?}
+
+      {PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT")
+
+      {PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH")
+  #64
+
+Fix
+---
+
+- Fix issue with multiple wildcard words using standalone engine
+  #46
+- Don't crash when no rules are provided
+  #50
+- Fix Number and ANY-OF parsing
+  #59
+- Allow escape characters inside LITERAL
+  #62
+
+
 0.4.0 (2020-01-25)
 ****************************
 

diff --git a/changes/46.fix.rst b/changes/46.fix.rst
diff --git a/changes/47.feature.rst b/changes/47.feature.rst
diff --git a/changes/50.fix.rst b/changes/50.fix.rst
diff --git a/changes/51.feature.rst b/changes/51.feature.rst
diff --git a/changes/53.feature.rst b/changes/53.feature.rst
diff --git a/changes/55.feature.rst b/changes/55.feature.rst
diff --git a/changes/59.fix.rst b/changes/59.fix.rst
diff --git a/changes/62.fix.rst b/changes/62.fix.rst
diff --git a/docs/advanced.md b/docs/advanced.md
@@ -11,3 +11,14 @@ Eg.:
 ```
 @import "examples/simple-match.rita"
 ```
+
+# Reusing patterns
+
+Since version 0.5.0, you can define a pattern as a variable and reuse it in other patterns:
+
+```
+ComplexNumber = {NUM+, WORD("/")?, NUM?}
+
+{PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT")
+{PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH")
+```
diff --git a/examples/complex-number.rita b/examples/complex-number.rita
@@ -0,0 +1,3 @@
+Complex_Number = { NUM+, WORD("/")?, NUM? }
+{PATTERN(Complex_Number), WORD("inches"), WORD("Width")}->MARK("WIDTH")
+{PATTERN(Complex_Number), WORD("inches"), WORD("Height")}->MARK("HEIGHT")
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "rita-dsl"
-version = "0.4.7"
+version = "0.5.0"
 description = "DSL for building language rules"
 authors = [
     "Šarūnas Navickas <zaibacu@gmail.com>"

diff --git a/rita/__init__.py b/rita/__init__.py
@@ -10,7 +10,7 @@
 
 logger = logging.getLogger(__name__)
 
-__version__ = (0, 4, 7, os.getenv("VERSION_PATCH"))
+__version__ = (0, 5, 0, os.getenv("VERSION_PATCH"))
 
 
 def get_version():

diff --git a/rita/engine/translate_spacy.py b/rita/engine/translate_spacy.py
@@ -88,7 +88,7 @@ def phrase_parse(value, config, op=None):
 
 
 def rules_to_patterns(label, data, config):
-    print(data)
+    logger.debug(data)
     return {
         "label": label,
         "pattern": [p

diff --git a/rita/engine/translate_standalone.py b/rita/engine/translate_standalone.py
@@ -91,7 +91,7 @@ def gen():
         yield data[0]
 
         for (t, d, op) in data[1:]:
-            yield (t, d, op)
+            yield t, d, op
 
     return (
         label,

diff --git a/rita/macros.py b/rita/macros.py
@@ -1,28 +1,12 @@
 import logging
 import types
 
-from itertools import chain
+from rita.utils import flatten
 
 logger = logging.getLogger(__name__)
 
 
-def flatten(lst):
-    if len(lst) > 1:
-        return lst
-
-    def explode(v):
-        if callable(v):
-            return v()
-        else:
-            return v
-
-    new_lst = map(explode, lst)
-    return chain(*new_lst)
-
-
 def resolve_value(obj, config):
-    context = []
-
     logger.debug("Resolving value: {0}".format(obj))
 
     if isinstance(obj, str):
@@ -32,9 +16,7 @@ def resolve_value(obj, config):
         return obj
 
     elif isinstance(obj, list):
-        for item in obj:
-            context.append(item)
-        return context
+        return obj
 
     elif isinstance(obj, types.GeneratorType):
         return "either", list(obj), None
@@ -69,10 +51,8 @@ def ASSIGN(k, v, config, op=None):
 
 
 def IN_LIST(*args, config, op=None):
-    variants = []
-    for arg in flatten(args):
-        variants.append(resolve_value(arg, config=config))
-    return "any_of", variants, None
+    return "any_of", [resolve_value(arg, config=config)
+                      for arg in flatten(args)], None
 
 
 def PATTERN(*args, config, op=None):

diff --git a/rita/parser.py b/rita/parser.py
@@ -195,7 +195,5 @@ def build(self, **kwargs):
     def parse(self, data):
         if data.strip() == "":
             return []
-        print(data)
-        print(r"{}".format(data))
 
         return self.parser.parse(r"{}".format(data), lexer=self.lexer, debug=logger)
diff --git a/rita/preprocess.py b/rita/preprocess.py
@@ -3,6 +3,7 @@
 from functools import reduce
 
 from rita.utils import Node, deaccent
+from rita.macros import resolve_value
 
 logger = logging.getLogger(__name__)
 
@@ -17,7 +18,7 @@ def apply_prefix(pattern, prefix):
             return (name, list(["{0}{1}".format(prefix, item)
                                 for item in args]), op)
         elif name == "value":
-            return (name, "{0}{1}".format(prefix, args), op)
+            return name, "{0}{1}".format(prefix, args), op
         else:
             logger.warning("Don't know how to apply prefix on: {}".format(name))
             return pattern
@@ -35,7 +36,7 @@ def gen():
                 else:
                     yield p
     for group_label, pattern in rules:
-        yield (group_label, list(gen()))
+        yield group_label, list(gen())
 
 
 def handle_deaccent(rules, config):
@@ -51,7 +52,7 @@ def gen():
                 if name == "value":
                     (v1, v2) = (args, deaccent(args))
                     if v1 != v2:
-                        yield ("any_of", (v1, v2,), op)
+                        yield "any_of", (v1, v2,), op
                     else:
                         yield p
                 elif name == "any_of":
@@ -64,11 +65,11 @@ def items():
                             else:
                                 yield v1
 
-                    yield ("any_of", list(items()), op)
+                    yield "any_of", list(items()), op
                 else:
                     yield p
 
-        yield (group_label, list(gen()))
+        yield group_label, list(gen())
 
 
 def add_implicit_punct(rules, config):
@@ -81,12 +82,12 @@ def add_implicit_punct(rules, config):
         def gen():
             for p in pattern:
                 yield p
-                yield ("punct", None, "?")
+                yield "punct", None, "?"
 
         if len(pattern) == 1:
-            yield (group_label, pattern)
+            yield group_label, pattern
         else:
-            yield (group_label, list(gen())[:-1])
+            yield group_label, list(gen())[:-1]
 
 
 def handle_multi_word(rules, config):
@@ -104,11 +105,11 @@ def gen():
             for p in pattern:
                 (name, args, op) = p
                 if name == "value" and is_complex(args):
-                    yield ("phrase", args, op)
+                    yield "phrase", args, op
                 else:
                     yield p
 
-        yield (group_label, list(gen()))
+        yield group_label, list(gen())
 
 
 def is_complex(arg):
@@ -175,15 +176,15 @@ def handle_rule_branching(rules, config):
         if any([p == "either"
                 for (p, _, _) in pattern]):
             for p in branch_pattern(pattern, config):
-                yield (group_label, p)
+                yield group_label, p
 
         # Covering case when there are complex items in list
         elif any([p == "any_of" and has_complex(o)
                   for (p, o, _) in pattern]):
             for p in branch_pattern(pattern, config):
-                yield (group_label, p)
+                yield group_label, p
         else:
-            yield (group_label, pattern)
+            yield group_label, pattern
 
 
 def dummy(rules, config):
@@ -195,7 +196,23 @@ def dummy(rules, config):
 
 
 def rule_tuple(d):
-    return (d["label"], d["data"])
+    return d["label"], d["data"]
+
+
+def expand_patterns(rules, config):
+    """
+    We can have situations where inside pattern we have another pattern (via Variable).
+    We want to expand this inner pattern and prepend to outer pattern
+    """
+    for group_label, pattern in rules:
+        def gen():
+            for p in pattern:
+                if callable(p):
+                    yield resolve_value(p, config=config)
+                else:
+                    yield p
+
+        yield group_label, list(gen())
 
 
 def preprocess_rules(root, config):
@@ -205,7 +222,7 @@ def preprocess_rules(root, config):
              for doc in root
              if doc and doc()]
 
-    pipeline = [dummy, handle_deaccent, handle_rule_branching, handle_multi_word, handle_prefix]
+    pipeline = [dummy, expand_patterns, handle_deaccent, handle_rule_branching, handle_multi_word, handle_prefix]
 
     if config.implicit_punct:
         logger.info("Adding implicit Punctuations")

diff --git a/rita/utils.py b/rita/utils.py
@@ -1,7 +1,7 @@
 import logging
 
 from unicodedata import normalize, category
-from itertools import cycle
+from itertools import cycle, chain
 
 logger = logging.getLogger(__name__)
 
@@ -100,3 +100,20 @@ def deaccent(text):
                      "".join(c
                              for c in normalize("NFD", text)
                              if category(c) != "Mn"))
+
+
+def flatten(lst, shallow=False):
+    def explode(v):
+        if callable(v):
+            return v()
+        else:
+            return v
+
+    if len(lst) > 1 and not shallow:
+        return lst
+
+    new_lst = map(explode, lst)
+    if shallow:
+        return new_lst
+    else:
+        return chain(*new_lst)
diff --git a/tests/test_examples.py b/tests/test_examples.py
@@ -197,3 +197,14 @@ def parse_rows(parser, rows):
         iterations=3,
         rounds=3
     )
+
+
+@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine])
+def test_variable_pattern(engine):
+    parser = engine(load_rules("examples/complex-number.rita"))
+    text = """
+        It is 17 1/2 inches width and 10 inches height
+        """
+
+    results = parser(text)
+    assert len(results) == 2
diff --git a/tests/test_lexer.py b/tests/test_lexer.py
@@ -126,3 +126,14 @@ def test_tokenize_variable_w_escape():
     assert tokens[2].type == "LITERAL"
     assert tokens[4].type == "ARROW"
     assert tokens[5].type == "KEYWORD"
+
+
+def test_pattern_in_variable():
+    lex = RitaLexer()
+    lex.build()
+
+    tokens = list(
+        lex.tokenize(r'COMPLEX_NUMBER = {NUM+, WORD("/")?, NUM}')
+    )
+
+    assert len(tokens) == 14
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -210,3 +210,18 @@ def test_parser_literal_w_escape(config):
     )
 
     assert len(results) == 1
+
+
+def test_parser_pattern_in_variable(config):
+    p = RitaParser(config)
+    p.build(debug=True)
+
+    results = p.parse(
+        '''
+        Complex_Number = { NUM+, WORD("/")?, NUM? }
+        {PATTERN(Complex_Number), WORD("inch")}->MARK("WIDTH")
+        '''
+    )
+
+    print(results)
+    assert len(results) == 2