explosion · india-kerle · Feb 19, 2024 · Feb 29, 2024 · Feb 29, 2024 · Feb 29, 2024
diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
@@ -1,4 +1,5 @@
 from .attributeruler import AttributeRuler
+#from .coordinationruler import CoordinationSplitter
 from .dep_parser import DependencyParser
 from .edit_tree_lemmatizer import EditTreeLemmatizer
 from .entity_linker import EntityLinker
@@ -21,6 +22,7 @@
 
 __all__ = [
     "AttributeRuler",
+    #"CoordinationSplitter",
     "DependencyParser",
     "EditTreeLemmatizer",
     "EntityLinker",

diff --git a/spacy/pipeline/coordinationruler.py b/spacy/pipeline/coordinationruler.py
@@ -0,0 +1,187 @@
+from typing import List, Callable, Optional, Union
+from pydantic import BaseModel, validator
+import re
+
+from ..tokens import Doc
+from ..language import Language
+from ..vocab import Vocab
+from .pipe import Pipe
+
+########### DEFAULT COORDINATION SPLITTING RULES ##############
+
+def split_noun_coordination(doc: Doc) -> Union[List[str], None]:
+    """Distribute a shared adjectival modifier over coordinated nouns.
+
+    A coordination is split only when exactly one of its nouns carries
+    ``amod`` modifiers; they are copied onto every conjunct, and the
+    modified noun is listed first. Relies on ``pos_``, ``dep_`` and the
+    dependency heads being set on the tokens.
+
+    Examples:
+        - "apples and oranges" -> None
+        - "green apples and oranges" -> ["green apples", "green oranges"]
+        - "green apples and rotten oranges" -> None
+        - "apples and juicy oranges" -> ["juicy oranges", "juicy apples"]
+        - "hot chicken wings and soup" -> ["hot chicken wings", "hot soup"]
+        - "spicy ice cream and chicken wings" -> ["spicy ice cream", "spicy chicken wings"]
+
+    Args:
+        doc (Doc): The input document.
+
+    Returns:
+        Union[List[str], None]: A list of the coordinated noun phrases,
+            or None if no coordinated noun phrases are found.
+    """
+    def _amod_texts(noun) -> List[str]:
+        # Texts of the adjectival modifiers attached directly to `noun`.
+        return [child.text for child in noun.children if child.dep_ == "amod"]
+
+    def _noun_with_compounds(noun) -> str:
+        # Keep compound parts with their noun ("chicken wings", "ice cream").
+        parts = [child.text for child in noun.lefts if child.dep_ == "compound"]
+        return " ".join(parts + [noun.text])
+
+    phrases = []
+    handled = set()  # nouns whose coordination has already been examined
+
+    for token in doc:
+        if token.pos_ != "NOUN" or token in handled:
+            continue
+        modifiers = _amod_texts(token)
+        if not modifiers:
+            continue
+        conjuncts = list(token.conjuncts)
+        handled.add(token)
+        handled.update(conjuncts)
+        if not conjuncts:
+            # A modified noun outside any coordination: nothing to split.
+            continue
+        if any(_amod_texts(noun) for noun in conjuncts):
+            # Each noun has its own modifier ("green apples and rotten
+            # oranges"), so the modifier is not shared and must not be copied.
+            continue
+        modifier = " ".join(modifiers)
+        for noun in [token] + conjuncts:
+            phrases.append(f"{modifier} {_noun_with_compounds(noun)}")
+
+    return phrases or None
+
+
+###############################################################
+
+# class SplittingRule(BaseModel):
+#     function: Callable[[Doc], Union[List[str], None]]
+
+#     @validator("function")
+#     def check_return_type(cls, v):
+#         nlp = en_core_web_sm.load()
+#         dummy_doc = nlp("This is a dummy sentence.")
+#         result = v(dummy_doc)
+#         if result is not None:
+#             if not isinstance(result, List):
+#                 raise ValueError(
+#                     "The custom splitting rule must return None or a list."
+#                 )
+#             elif not all(isinstance(item, str) for item in result):
+#                 raise ValueError(
+#                     "The custom splitting rule must return None or a list of strings."
+#                 )
+#         return v
+
+
+# @Language.factory(
+#     "coordination_splitter", requires=["token.dep", "token.tag", "token.pos"]
+# )
+# def make_coordination_splitter(nlp: Language, name: str):
+#     """Make a CoordinationSplitter component.
+
+#     the default splitting rules include:
+
+#     - _split_duplicate_object: Split a text with 2 verbs and 1 object (and optionally a subject) into two texts each with 1 verb, the shared object (and its modifiers), and the subject if present.
+#     - _split_duplicate_verb: Split a text with 1 verb and 2 objects into two texts each with 1 verb and 1 object.
+#     - _split_skill_mentions: Split a text with 2 skills into 2 texts with 1 skill (the phrase must end with 'skills' and the skills must be separated by 'and')
+
+
+#     Args:
+#         nlp (Language): The spaCy Language object.
+#         name (str): The name of the component.
+
+#     RETURNS The CoordinationSplitter component.
+
+#     DOCS: xxx
+#     """
+
+#     return CoordinationSplitter(nlp.vocab, name=name)
+
+
+# class CoordinationSplitter(Pipe):
+#     def __init__(
+#         self,
+#         vocab: Vocab,
+#         name: str = "coordination_splitter",
+#         rules: Optional[List[SplittingRule]] = None,
+#     ) -> None:
+#         self.name = name
+#         self.vocab = vocab
+#         if rules is None:
+#             default_rules = [
+#                 _split_duplicate_object,
+#                 _split_duplicate_verb,
+#                 _split_skill_mentions,
+#             ]
+#             self.rules = [SplittingRule(function=rule) for rule in default_rules]
+#         else:
+#             # Ensure provided rules are wrapped in SplittingRule instances
+#             self.rules = [
+#                 rule
+#                 if isinstance(rule, SplittingRule)
+#                 else SplittingRule(function=rule)
+#                 for rule in rules
+#             ]
+
+#     def clear_rules(self) -> None:
+#         """Clear the default splitting rules."""
+#         self.rules = []
+
+#     def add_default_rules(self) -> List[SplittingRule]:
+#         """Reset the default splitting rules."""
+#         default_rules = [
+#             _split_duplicate_object,
+#             _split_duplicate_verb,
+#             _split_skill_mentions,
+#         ]
+#         self.rules = [SplittingRule(function=rule) for rule in default_rules]
+
+#     def add_rule(self, rule: Callable[[Doc], Union[List[str], None]]) -> None:
+#         """Add a single splitting rule to the default rules."""
+#         validated_rule = SplittingRule(function=rule)
+#         self.rules.append(validated_rule)
+
+#     def add_rules(self, rules: List[Callable[[Doc], Union[List[str], None]]]) -> None:
+#         """Add a list of splitting rules to the default rules.
+
+#         Args:
+#             rules (List[Callable[[Doc], Union[List[str], None]]]): A list of functions to be added as splitting rules.
+#         """
+#         for rule in rules:
+#             # Wrap each rule in a SplittingRule instance to ensure it's validated
+#             validated_rule = SplittingRule(function=rule)
+#             self.rules.append(validated_rule)
+
+#     def __call__(self, doc: Doc) -> Doc:
+#         """Apply the splitting rules to the doc.
+
+#         Args:
+#             doc (Doc): The spaCy Doc object.
+
+#         Returns:
+#             Doc: The modified spaCy Doc object.
+#         """
+#         if doc.lang_ != "en":
+#             return doc
+
+#         for rule in self.rules:
+#             split = rule.function(doc)
+#             if split:
+#                 return Doc(doc.vocab, words=split)
+#         return doc
diff --git a/spacy/tests/pipeline/test_coordinationruler.py b/spacy/tests/pipeline/test_coordinationruler.py
@@ -0,0 +1,166 @@
+import pytest
+from typing import List
+
+from spacy.tokens import Doc
+import spacy
+
+from spacy.pipeline.coordinationruler import split_noun_coordination
+
+@pytest.fixture
+def nlp():
+    return spacy.blank("en")  # the Doc fixtures in this file only read nlp.vocab from it
+
+### NOUN CONSTRUCTION CASES ###
+@pytest.fixture
+def noun_construction_case1(nlp):
+    """Build "apples and oranges": two coordinated nouns, no modifier."""
+    annotated = [
+        # (word, pos, dep)
+        ("apples", "NOUN", "nsubj"),
+        ("and", "CCONJ", "cc"),
+        ("oranges", "NOUN", "conj"),
+    ]
+    texts = [word for word, _, _ in annotated]
+    # Every word except the last one is followed by a space.
+    doc = Doc(nlp.vocab, words=texts, spaces=[True, True, False])
+    for tok, (_, pos, dep) in zip(doc, annotated):
+        tok.pos_ = pos
+        tok.dep_ = dep
+
+    doc[1].head = doc[2]  # "and" -> "oranges"
+    doc[2].head = doc[0]  # "oranges" -> "apples"
+    doc[0].head = doc[0]  # "apples" is its own head
+    return doc
+
+@pytest.fixture
+def noun_construction_case2(nlp):
+    """Build "red apples and oranges": the modifier precedes the first noun."""
+    annotated = [
+        # (word, pos, dep)
+        ("red", "ADJ", "amod"),
+        ("apples", "NOUN", "nsubj"),
+        ("and", "CCONJ", "cc"),
+        ("oranges", "NOUN", "conj"),
+    ]
+    texts = [word for word, _, _ in annotated]
+    # Note: "apples" (doc[1]) is never assigned a head in this fixture.
+    doc = Doc(nlp.vocab, words=texts, spaces=[True, True, True, False])
+    for tok, (_, pos, dep) in zip(doc, annotated):
+        tok.pos_ = pos
+        tok.dep_ = dep
+
+    doc[0].head = doc[1]  # "red" -> "apples"
+    doc[2].head = doc[3]  # "and" -> "oranges"
+    doc[3].head = doc[1]  # "oranges" -> "apples"
+    return doc
+
+@pytest.fixture
+def noun_construction_case3(nlp):
+    """Build "apples and juicy oranges": the modifier sits on the second noun."""
+    annotated = [
+        # (word, pos, dep)
+        ("apples", "NOUN", "nsubj"),
+        ("and", "CCONJ", "cc"),
+        ("juicy", "ADJ", "amod"),
+        ("oranges", "NOUN", "conj"),
+    ]
+    texts = [word for word, _, _ in annotated]
+    # Every word except the last one is followed by a space.
+    doc = Doc(nlp.vocab, words=texts, spaces=[True, True, True, False])
+    for tok, (_, pos, dep) in zip(doc, annotated):
+        tok.pos_ = pos
+        tok.dep_ = dep
+
+    doc[0].head = doc[0]  # "apples" is its own head
+    doc[1].head = doc[3]  # "and" -> "oranges"
+    doc[2].head = doc[3]  # "juicy" -> "oranges"
+    doc[3].head = doc[0]  # "oranges" -> "apples" (the conj relation)
+    return doc
+
+@pytest.fixture
+def noun_construction_case4(nlp):
+    """Build "hot chicken wings and soup": a compound noun in the coordination."""
+    annotated = [  # (word, pos, dep)
+        ("hot", "ADJ", "amod"),
+        ("chicken", "NOUN", "compound"),
+        ("wings", "NOUN", "ROOT"),
+        ("and", "CCONJ", "cc"),
+        ("soup", "NOUN", "conj"),
+    ]
+    texts = [word for word, _, _ in annotated]
+    doc = Doc(nlp.vocab, words=texts, spaces=[True, True, True, True, False])
+    for tok, (_, pos, dep) in zip(doc, annotated):
+        tok.pos_ = pos
+        tok.dep_ = dep
+    doc[0].head = doc[2]  # "hot" -> "wings"
+    doc[1].head = doc[2]  # "chicken" -> "wings"
+    doc[2].head = doc[2]  # "wings" is its own head
+    doc[3].head = doc[4]  # "and" -> "soup"
+    doc[4].head = doc[2]  # "soup" -> "wings"
+    return doc
+
+@pytest.fixture
+def noun_construction_case5(nlp):
+    """Build "green apples and rotten oranges": each noun has its own modifier."""
+    annotated = [  # (word, pos, dep)
+        ("green", "ADJ", "amod"),
+        ("apples", "NOUN", "ROOT"),
+        ("and", "CCONJ", "cc"),
+        ("rotten", "ADJ", "amod"),
+        ("oranges", "NOUN", "conj"),
+    ]
+    texts = [word for word, _, _ in annotated]
+    doc = Doc(nlp.vocab, words=texts, spaces=[True, True, True, True, False])
+    for tok, (_, pos, dep) in zip(doc, annotated):
+        tok.pos_ = pos
+        tok.dep_ = dep
+
+    doc[0].head = doc[1]  # "green" -> "apples"
+    doc[1].head = doc[1]  # "apples" is its own head
+    doc[2].head = doc[4]  # "and" -> "oranges"
+    doc[3].head = doc[4]  # "rotten" -> "oranges"
+    doc[4].head = doc[1]  # "oranges" -> "apples"
+    return doc
+
+#test split_noun_coordination on 5 different cases
+def test_split_noun_coordination(
+    noun_construction_case1,
+    noun_construction_case2,
+    noun_construction_case3,
+    noun_construction_case4,
+    noun_construction_case5,
+):
+    """Check split_noun_coordination against the five noun-phrase fixtures."""
+    # test 1: no modifier, so there is nothing to distribute
+    case1_split = split_noun_coordination(noun_construction_case1)
+    assert case1_split is None
+
+    # test 2: modifier is at the beginning of the noun phrase
+    case2_split = split_noun_coordination(noun_construction_case2)
+
+    # Type checks come first so a None result fails an assert, not len().
+    assert isinstance(case2_split, list)
+    assert all(isinstance(phrase, str) for phrase in case2_split)
+    assert len(case2_split) == 2
+    assert case2_split == ["red apples", "red oranges"]
+
+    # test 3: the modifier is attached to the second noun
+    case3_split = split_noun_coordination(noun_construction_case3)
+
+    assert isinstance(case3_split, list)
+    assert all(isinstance(phrase, str) for phrase in case3_split)
+    assert len(case3_split) == 2
+    assert case3_split == ["juicy oranges", "juicy apples"]
+
+    # test 4: deal with compound nouns
+    case4_split = split_noun_coordination(noun_construction_case4)
+
+    assert isinstance(case4_split, list)
+    assert all(isinstance(phrase, str) for phrase in case4_split)
+    assert len(case4_split) == 2
+    assert case4_split == ["hot chicken wings", "hot soup"]
+
+    # test 5: each noun has its own modifier. TODO: the original author was
+    # unsure whether this should be None, so only the return contract is checked.
+    case5_split = split_noun_coordination(noun_construction_case5)
+    assert case5_split is None or isinstance(case5_split, list)