Skip to content

Commit

Permalink
Merge pull request #64 from zaibacu/allow-store-patterns-in-variables
Browse files Browse the repository at this point in the history
Allow store patterns in variables
  • Loading branch information
zaibacu committed Jun 18, 2020
2 parents c6c1600 + 82f7dc8 commit 215a119
Show file tree
Hide file tree
Showing 22 changed files with 148 additions and 54 deletions.
39 changes: 39 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,42 @@
0.5.0 (2020-06-18)
****************************

Features
--------

- Added `PREFIX` macro, which allows attaching a word in front of list items or words
#47
- Allow passing variables directly when calling `compile` and `compile_string`
#51
- Allow compiling (and later loading) rules using the rita CLI with the standalone engine (spaCy is already supported)
#53
- Added the ability to import rule files into a rule file. Recursive imports are supported as well.
#55
- Added the possibility to define a pattern as a variable and reuse it in other patterns:

Example:
.. code-block:: RITA

ComplexNumber = {NUM+, WORD("/")?, NUM?}

{PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT")

{PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH")
#64

Fix
---

- Fix issue with multiple wildcard words using standalone engine
#46
- Don't crash when no rules are provided
#50
- Fix Number and ANY-OF parsing
#59
- Allow escape characters inside LITERAL
#62


0.4.0 (2020-01-25)
****************************

Expand Down
1 change: 0 additions & 1 deletion changes/46.fix.rst

This file was deleted.

1 change: 0 additions & 1 deletion changes/47.feature.rst

This file was deleted.

1 change: 0 additions & 1 deletion changes/50.fix.rst

This file was deleted.

1 change: 0 additions & 1 deletion changes/51.feature.rst

This file was deleted.

1 change: 0 additions & 1 deletion changes/53.feature.rst

This file was deleted.

1 change: 0 additions & 1 deletion changes/55.feature.rst

This file was deleted.

1 change: 0 additions & 1 deletion changes/59.fix.rst

This file was deleted.

1 change: 0 additions & 1 deletion changes/62.fix.rst

This file was deleted.

11 changes: 11 additions & 0 deletions docs/advanced.md
Expand Up @@ -11,3 +11,14 @@ Eg.:
```
@import "examples/simple-match.rita"
```

# Reusing patterns

Since version 0.5.0, you can define a pattern as a variable and reuse it in other patterns:

```
ComplexNumber = {NUM+, WORD("/")?, NUM?}
{PATTERN(ComplexNumber), WORD("inches"), WORD("Height")}->MARK("HEIGHT")
{PATTERN(ComplexNumber), WORD("inches"), WORD("Width")}->MARK("WIDTH")
```
3 changes: 3 additions & 0 deletions examples/complex-number.rita
@@ -0,0 +1,3 @@
Complex_Number = { NUM+, WORD("/")?, NUM? }
{PATTERN(Complex_Number), WORD("inches"), WORD("Width")}->MARK("WIDTH")
{PATTERN(Complex_Number), WORD("inches"), WORD("Height")}->MARK("HEIGHT")
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "rita-dsl"
version = "0.4.7"
version = "0.5.0"
description = "DSL for building language rules"
authors = [
"Šarūnas Navickas <zaibacu@gmail.com>"
Expand Down
2 changes: 1 addition & 1 deletion rita/__init__.py
Expand Up @@ -10,7 +10,7 @@

logger = logging.getLogger(__name__)

__version__ = (0, 4, 7, os.getenv("VERSION_PATCH"))
__version__ = (0, 5, 0, os.getenv("VERSION_PATCH"))


def get_version():
Expand Down
2 changes: 1 addition & 1 deletion rita/engine/translate_spacy.py
Expand Up @@ -88,7 +88,7 @@ def phrase_parse(value, config, op=None):


def rules_to_patterns(label, data, config):
print(data)
logger.debug(data)
return {
"label": label,
"pattern": [p
Expand Down
2 changes: 1 addition & 1 deletion rita/engine/translate_standalone.py
Expand Up @@ -91,7 +91,7 @@ def gen():
yield data[0]

for (t, d, op) in data[1:]:
yield (t, d, op)
yield t, d, op

return (
label,
Expand Down
28 changes: 4 additions & 24 deletions rita/macros.py
@@ -1,28 +1,12 @@
import logging
import types

from itertools import chain
from rita.utils import flatten

logger = logging.getLogger(__name__)


def flatten(lst):
if len(lst) > 1:
return lst

def explode(v):
if callable(v):
return v()
else:
return v

new_lst = map(explode, lst)
return chain(*new_lst)


def resolve_value(obj, config):
context = []

logger.debug("Resolving value: {0}".format(obj))

if isinstance(obj, str):
Expand All @@ -32,9 +16,7 @@ def resolve_value(obj, config):
return obj

elif isinstance(obj, list):
for item in obj:
context.append(item)
return context
return obj

elif isinstance(obj, types.GeneratorType):
return "either", list(obj), None
Expand Down Expand Up @@ -69,10 +51,8 @@ def ASSIGN(k, v, config, op=None):


def IN_LIST(*args, config, op=None):
variants = []
for arg in flatten(args):
variants.append(resolve_value(arg, config=config))
return "any_of", variants, None
return "any_of", [resolve_value(arg, config=config)
for arg in flatten(args)], None


def PATTERN(*args, config, op=None):
Expand Down
2 changes: 0 additions & 2 deletions rita/parser.py
Expand Up @@ -195,7 +195,5 @@ def build(self, **kwargs):
def parse(self, data):
if data.strip() == "":
return []
print(data)
print(r"{}".format(data))

return self.parser.parse(r"{}".format(data), lexer=self.lexer, debug=logger)
47 changes: 32 additions & 15 deletions rita/preprocess.py
Expand Up @@ -3,6 +3,7 @@
from functools import reduce

from rita.utils import Node, deaccent
from rita.macros import resolve_value

logger = logging.getLogger(__name__)

Expand All @@ -17,7 +18,7 @@ def apply_prefix(pattern, prefix):
return (name, list(["{0}{1}".format(prefix, item)
for item in args]), op)
elif name == "value":
return (name, "{0}{1}".format(prefix, args), op)
return name, "{0}{1}".format(prefix, args), op
else:
logger.warning("Don't know how to apply prefix on: {}".format(name))
return pattern
Expand All @@ -35,7 +36,7 @@ def gen():
else:
yield p
for group_label, pattern in rules:
yield (group_label, list(gen()))
yield group_label, list(gen())


def handle_deaccent(rules, config):
Expand All @@ -51,7 +52,7 @@ def gen():
if name == "value":
(v1, v2) = (args, deaccent(args))
if v1 != v2:
yield ("any_of", (v1, v2,), op)
yield "any_of", (v1, v2,), op
else:
yield p
elif name == "any_of":
Expand All @@ -64,11 +65,11 @@ def items():
else:
yield v1

yield ("any_of", list(items()), op)
yield "any_of", list(items()), op
else:
yield p

yield (group_label, list(gen()))
yield group_label, list(gen())


def add_implicit_punct(rules, config):
Expand All @@ -81,12 +82,12 @@ def add_implicit_punct(rules, config):
def gen():
for p in pattern:
yield p
yield ("punct", None, "?")
yield "punct", None, "?"

if len(pattern) == 1:
yield (group_label, pattern)
yield group_label, pattern
else:
yield (group_label, list(gen())[:-1])
yield group_label, list(gen())[:-1]


def handle_multi_word(rules, config):
Expand All @@ -104,11 +105,11 @@ def gen():
for p in pattern:
(name, args, op) = p
if name == "value" and is_complex(args):
yield ("phrase", args, op)
yield "phrase", args, op
else:
yield p

yield (group_label, list(gen()))
yield group_label, list(gen())


def is_complex(arg):
Expand Down Expand Up @@ -175,15 +176,15 @@ def handle_rule_branching(rules, config):
if any([p == "either"
for (p, _, _) in pattern]):
for p in branch_pattern(pattern, config):
yield (group_label, p)
yield group_label, p

# Covering case when there are complex items in list
elif any([p == "any_of" and has_complex(o)
for (p, o, _) in pattern]):
for p in branch_pattern(pattern, config):
yield (group_label, p)
yield group_label, p
else:
yield (group_label, pattern)
yield group_label, pattern


def dummy(rules, config):
Expand All @@ -195,7 +196,23 @@ def dummy(rules, config):


def rule_tuple(d):
return (d["label"], d["data"])
return d["label"], d["data"]


def expand_patterns(rules, config):
    """
    Resolve callable entries found inside rule patterns.

    A pattern may contain a callable entry (e.g. a pattern referenced through
    a variable). Every callable entry is replaced by the result of
    `resolve_value(entry, config=config)`; all other entries are passed
    through untouched.

    Yields one `(group_label, entries)` tuple per input rule, where `entries`
    is always a freshly built list.
    """
    for group_label, pattern in rules:
        expanded = [
            resolve_value(entry, config=config) if callable(entry) else entry
            for entry in pattern
        ]
        yield group_label, expanded


def preprocess_rules(root, config):
Expand All @@ -205,7 +222,7 @@ def preprocess_rules(root, config):
for doc in root
if doc and doc()]

pipeline = [dummy, handle_deaccent, handle_rule_branching, handle_multi_word, handle_prefix]
pipeline = [dummy, expand_patterns, handle_deaccent, handle_rule_branching, handle_multi_word, handle_prefix]

if config.implicit_punct:
logger.info("Adding implicit Punctuations")
Expand Down
19 changes: 18 additions & 1 deletion rita/utils.py
@@ -1,7 +1,7 @@
import logging

from unicodedata import normalize, category
from itertools import cycle
from itertools import cycle, chain

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -100,3 +100,20 @@ def deaccent(text):
"".join(c
for c in normalize("NFD", text)
if category(c) != "Mn"))


def flatten(lst, shallow=False):
    """
    Resolve callables inside `lst` and optionally merge the results.

    - When `shallow` is false and `lst` holds more than one element, `lst`
      is returned as-is (no callable is invoked).
    - When `shallow` is true, a lazy iterator is returned in which every
      callable element is replaced by the value it returns.
    - Otherwise (at most one element, not shallow) the resolved elements are
      chained together into a single iterator, so each resolved element
      must itself be iterable.
    """
    has_many = len(lst) > 1
    if has_many and not shallow:
        return lst

    resolved = map(lambda item: item() if callable(item) else item, lst)
    return resolved if shallow else chain(*resolved)
11 changes: 11 additions & 0 deletions tests/test_examples.py
Expand Up @@ -197,3 +197,14 @@ def parse_rows(parser, rows):
iterations=3,
rounds=3
)


@pytest.mark.parametrize('engine', [spacy_engine, standalone_engine])
def test_variable_pattern(engine):
    """Rules that reuse a pattern variable should yield two results on the sample text."""
    rules = load_rules("examples/complex-number.rita")
    matcher = engine(rules)
    sample = "\nIt is 17 1/2 inches width and 10 inches height\n"

    matches = matcher(sample)
    assert len(matches) == 2
11 changes: 11 additions & 0 deletions tests/test_lexer.py
Expand Up @@ -126,3 +126,14 @@ def test_tokenize_variable_w_escape():
assert tokens[2].type == "LITERAL"
assert tokens[4].type == "ARROW"
assert tokens[5].type == "KEYWORD"


def test_pattern_in_variable():
    """A pattern assigned to a variable should tokenize into 14 tokens."""
    lexer = RitaLexer()
    lexer.build()

    source = r'COMPLEX_NUMBER = {NUM+, WORD("/")?, NUM}'
    found = list(lexer.tokenize(source))

    assert len(found) == 14
15 changes: 15 additions & 0 deletions tests/test_parser.py
Expand Up @@ -210,3 +210,18 @@ def test_parser_literal_w_escape(config):
)

assert len(results) == 1


def test_parser_pattern_in_variable(config):
    """Parsing a variable-held pattern plus a rule that uses it should give two results."""
    rita_parser = RitaParser(config)
    rita_parser.build(debug=True)

    source = (
        '\n'
        'Complex_Number = { NUM+, WORD("/")?, NUM? }\n'
        '{PATTERN(Complex_Number), WORD("inch")}->MARK("WIDTH")\n'
    )
    parsed = rita_parser.parse(source)

    print(parsed)
    assert len(parsed) == 2

0 comments on commit 215a119

Please sign in to comment.