Merge pull request #42 from zaibacu/shortcuts-module

Shortcuts module
zaibacu · Jan 25, 2020 · 718ddfe · 718ddfe
2 parents 1fdb95e + 8838668
commit 718ddfe
Show file tree

Hide file tree

Showing 9 changed files with 144 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,22 @@
-None 0.3.2 (2019-12-19)
+0.4.0 (2020-01-25)
+****************************
+
+Features
+--------
+
+- Support for deaccent. In general, if an accented version of a word is given, both the deaccented and accented versions will be used to match. To turn it off - `!CONFIG("deaccent", "N")`
+  #38
+- Added shortcuts module to simplify injecting into spaCy
+  #42
+
+Fix
+---
+
+- Fix issue regarding spaCy rules with `IN_LIST` and using case-sensitive mode. It was creating a Regex pattern which is not a valid spaCy pattern
+  #40
+
+
+0.3.2 (2019-12-19)
 ***********************
 
 Features

diff --git a/README.md b/README.md
@@ -20,3 +20,49 @@ This is a language, loosely based on language [Apache UIMA RUTA](https://uima.ap
 ## Support
 
 [![reddit](https://img.shields.io/reddit/subreddit-subscribers/ritaDSL?style=social)](https://www.reddit.com/r/ritaDSL/)
+
+
+## Simple Rules example
+
+```python
+rules = """
+cuts = {"fitted", "wide-cut"}
+lengths = {"short", "long", "calf-length", "knee-length"}
+fabric_types = {"soft", "airy", "crinkled"}
+fabrics = {"velour", "chiffon", "knit", "woven", "stretch"}
+
+{IN_LIST(cuts)?, IN_LIST(lengths), WORD("dress")}->MARK("DRESS_TYPE")
+{IN_LIST(lengths), IN_LIST(cuts), WORD("dress")}->MARK("DRESS_TYPE")
+{IN_LIST(fabric_types)?, IN_LIST(fabrics)}->MARK("DRESS_FABRIC")
+"""
+```
+
+### Loading in spaCy
+```python
+import spacy
+from rita.shortcuts import setup_spacy
+
+
+nlp = spacy.load("en")
+setup_spacy(nlp, rules_string=rules)
+```
+
+And using it:
+```
+>>> r = nlp("She was wearing a short wide-cut dress")
+>>> [{"label": e.label_, "text": e.text} for e in r.ents]
+[{'label': 'DRESS_TYPE', 'text': 'short wide-cut dress'}]
+```
+
+### Loading using Regex (standalone)
+```
+import rita
+
+patterns = rita.compile_string(rules, use_engine="standalone")
+```
+
+And using it:
+```
+>>> list(patterns.execute("She was wearing a short wide-cut dress"))
+[{'end': 38, 'label': 'DRESS_TYPE', 'start': 18, 'text': 'short wide-cut dress'}]
+```
diff --git a/changes/38.feature.rst b/changes/38.feature.rst
diff --git a/changes/40.fix.rst b/changes/40.fix.rst
diff --git a/changes/42.feature.rst b/changes/42.feature.rst
@@ -0,0 +1 @@
+Added shortcuts module to simplify injecting into spaCy
diff --git a/docs/quickstart.md b/docs/quickstart.md
@@ -22,6 +22,23 @@ Now you can compile these rules `rita -f <your-file>.rita output.jsonl`
 
 ## spaCy backend
 
+### NEW in 0.4.0: Shortcuts to simplify life:
+```
+import spacy
+from rita.shortcuts import setup_spacy
+
+nlp = spacy.load("en")
+setup_spacy(nlp, ...)
+```
+
+If compiling rules from a string:
+`setup_spacy(nlp, rules_string=rules)`
+If loading rules from `.rita` file
+`setup_spacy(nlp, rules_path="examples/color-car.rita")`
+If loading from spaCy compiled rules:
+`setup_spacy(nlp, patterns="rules.jsonl")`
+
+### Doing it manually
 ```python
 import spacy
 from spacy.pipeline import EntityRuler

diff --git a/rita/__init__.py b/rita/__init__.py
@@ -9,7 +9,7 @@
 
 logger = logging.getLogger(__name__)
 
-__version__ = (0, 3, 4, os.getenv("VERSION_PATCH"))
+__version__ = (0, 4, 0, os.getenv("VERSION_PATCH"))
 
 
 def get_version():
@@ -39,8 +39,8 @@ def compile_string(raw, config, use_engine=None):
         return result
 
 
-def compile(fname, compile_fn=None):
+def compile(fname, use_engine=None):
     with open(fname, "r") as f:
         raw = f.read()
 
-    return compile_string(raw)
+    return compile_string(raw, use_engine=use_engine)
diff --git a/rita/shortcuts.py b/rita/shortcuts.py
@@ -0,0 +1,20 @@
+import rita
+
+
+def setup_spacy(model, patterns=None, rules_path=None, rules_string=None, override_ents=True):
+    from spacy.pipeline import EntityRuler
+    ruler = EntityRuler(model, overwrite_ents=override_ents)
+    if not patterns:
+        if rules_path:
+            patterns = rita.compile(rules_path, use_engine="spacy")
+        elif rules_string:
+            patterns = rita.compile_string(rules_string, use_engine="spacy")
+        else:
+            raise RuntimeError("Please provides rules. Either `patterns`, `rules_path` or `rules_string`")
+
+        ruler.add_patterns(patterns)
+    else:
+        ruler.from_disk(patterns)
+
+    model.add_pipe(ruler)
+    return model
diff --git a/tests/test_run.py b/tests/test_run.py
@@ -1,6 +1,12 @@
 import sys
+import json
+import tempfile
+
+import pytest
+import rita
 
 from rita.run import main
+from rita.shortcuts import setup_spacy
 
 
 def test_simple_compile(mocker):
@@ -11,3 +17,35 @@ def test_simple_compile(mocker):
         "output.jsonl"
     ]
     main()
+
+
+def test_shortcuts_spacy_inline():
+    spacy = pytest.importorskip("spacy", minversion="2.1")
+    nlp = spacy.load("en")
+    rules = """
+    {WORD("TEST")}->MARK("TEST")
+    """
+    setup_spacy(nlp, rules_string=rules)
+
+
+def test_shortcuts_spacy_file():
+    spacy = pytest.importorskip("spacy", minversion="2.1")
+    nlp = spacy.load("en")
+    setup_spacy(nlp, rules_path="examples/color-car.rita")
+
+
+def test_shortcuts_spacy_compiled():
+    spacy = pytest.importorskip("spacy", minversion="2.1")
+    nlp = spacy.load("en")
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl") as f:
+        patterns = rita.compile("examples/color-car.rita")
+        for pattern in patterns:
+            f.write(json.dumps(pattern) + "\n")
+        setup_spacy(nlp, patterns=f.name)
+
+
+def test_shortcuts_spacy_giving_no_rules():
+    spacy = pytest.importorskip("spacy", minversion="2.1")
+    nlp = spacy.load("en")
+    with pytest.raises(RuntimeError):
+        setup_spacy(nlp)