Skip to content

Commit

Permalink
Add an Ssurgeon operation which adds an (English only) lemma to text
Browse files Browse the repository at this point in the history
  • Loading branch information
AngledLuffa committed Dec 5, 2023
1 parent d302c63 commit c26b25e
Show file tree
Hide file tree
Showing 3 changed files with 130 additions and 0 deletions.
65 changes: 65 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Lemmatize.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
package edu.stanford.nlp.semgraph.semgrex.ssurgeon;

import java.util.Objects;

import edu.stanford.nlp.international.Language;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.IndexedWord;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.semgrex.SemgrexMatcher;

/**
 * Add the output of the English lemmatizer to the word in question.
 * Currently this only supports English!  You can add a known lemma
 * for a different language by using EditNode and setting the lemma
 * attribute.
 * <br>
 * The edit line for this operation looks like {@code lemmatize -node name}.
 *
 * @author John Bauer
 */
public class Lemmatize extends SsurgeonEdit {
  public static final String LABEL = "lemmatize";

  /** Name of the semgrex-matched node which will receive the lemma */
  final String nodeName;
  /** The lemmatizer applied to the matched node's label */
  final Morphology morphology;
  /** The language being lemmatized; after construction this is always {@code Language.English} */
  final Language language;

  /**
   * Build a lemmatize operation for the given named node.
   *
   * @param nodeName the name (as bound in the semgrex pattern) of the node to lemmatize
   * @param language the language of the text.  {@code English} and {@code UniversalEnglish}
   *                 are accepted, and {@code Unknown} is treated as English
   * @throws SsurgeonParseException if {@code nodeName} is null or the language is anything
   *                 other than the ones listed above (including null)
   */
  public Lemmatize(String nodeName, Language language) {
    if (nodeName == null) {
      throw new SsurgeonParseException("Cannot make a Lemmatize with no nodeName");
    }
    this.nodeName = nodeName;

    // Unknown is silently treated as English.
    // TODO: possibly log something when the language is Unknown
    boolean treatAsEnglish = (language == Language.UniversalEnglish ||
                              language == Language.English ||
                              language == Language.Unknown);
    if (!treatAsEnglish) {
      throw new SsurgeonParseException("Lemmatizing " + language + " is not supported");
    }
    this.language = Language.English;

    this.morphology = new Morphology();
  }

  /**
   * Produces the edit line for this operation: the label, a tab,
   * the node argument flag, a space, and the node name.
   */
  @Override
  public String toEditString() {
    return LABEL + "\t" + Ssurgeon.NODENAME_ARG + " " + nodeName;
  }

  /**
   * Runs the lemmatizer on the node named {@code nodeName} in the current match.
   *
   * @param sg the graph being edited (not used directly; the node comes from the matcher)
   * @param sm the current match, used to look up the named node
   * @return true iff the node's lemma is different after lemmatizing.
   *         Returns false if the matcher has no node with that name
   */
  @Override
  public boolean evaluate(SemanticGraph sg, SemgrexMatcher sm) {
    IndexedWord word = sm.getNode(nodeName);
    if (word == null) {
      return false;
    }

    String oldLemma = word.lemma();
    morphology.stem(word.backingLabel(), CoreAnnotations.LemmaAnnotation.class);
    // Only report a change if the lemma actually differs, so that a word
    // which already has the same lemma is not reported as edited
    return !Objects.equals(oldLemma, word.lemma());
  }
}
10 changes: 10 additions & 0 deletions src/edu/stanford/nlp/semgraph/semgrex/ssurgeon/Ssurgeon.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@
* <li> {@code reattachNamedEdge -edge edgename -gov gov -dep dep}
* <li> {@code addDep -gov node1 -reln depType -position where ...attributes...}
* <li> {@code editNode -node node ...attributes...}
* <li> {@code lemmatize -node node}
* <li> {@code combineMWT -node node -word word}
* <li> {@code setRoots n1 (n2 n3 ...)}
* <li> {@code mergeNodes n1 n2}
Expand Down Expand Up @@ -137,6 +138,10 @@
* needs the ability to add or remove features without resetting the entire features map,
* please file an issue on github.
*</p><p>
* {@code lemmatize} will put a lemma on a word.
* {@code -node} is the node to edit.
* This only works on English text.
*</p><p>
* {@code combineMWT} will add MWT attributes to a sequence of two or more words.
* {@code -node} (repeated) is the nodes to edit.
* {@code -word} is the optional text to use for the new MWT. If not set, the words will be concatenated.
Expand Down Expand Up @@ -566,6 +571,11 @@ public static SsurgeonEdit parseEditLine(String editLine, Map<String, String> at
throw new SsurgeonParseException("Cannot make an EditNode out of " + argsBox.nodes.size() + " nodes. Please use exactly one -node");
}
return new EditNode(argsBox.nodes.get(0), argsBox.annotations, argsBox.updateMorphoFeatures);
} else if (command.equalsIgnoreCase(Lemmatize.LABEL)) {
if (argsBox.nodes.size() != 1) {
throw new SsurgeonParseException("Cannot make a Lemmatize out of " + argsBox.nodes.size() + " nodes. Please use exactly one -node");
}
return new Lemmatize(argsBox.nodes.get(0), language);
} else if (command.equalsIgnoreCase(MergeNodes.LABEL)) {
if (argsBox.nodes.size() < 2) {
throw new SsurgeonParseException("Cannot make a MergeNodes out of fewer than 2 nodes (got " + argsBox.nodes.size() + ")");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1362,6 +1362,61 @@ public void checkAnnotationConversionErrors() {


/**
 * Check that the {@code lemmatize} edit puts the (English) lemma on the nodes it targets.
 * <br>
 * Two semgrex patterns are tried against the same unlemmatized graph:
 * one which only matches nodes that have no lemma yet, and one which
 * matches every node regardless of whether it already has a lemma.
 * Both are expected to produce the same lemmas.
 */
@Test
public void readXMLLemmatize() {
  Ssurgeon inst = Ssurgeon.inst();

  String[] semgrexPatterns = {
    // only matches nodes which do not have a lemma yet
    "!{lemma:/.+/}=nolemma",
    // matches every node, so it keeps matching nodes which were already lemmatized.
    // this version would bomb if lemmatize were not bomb-proof
    // (Lemmatize reports "no change" when the lemma comes out the same;
    // presumably that is what lets iterate() finish - confirm against SsurgeonPattern)
    "{}=nolemma",
  };
  // indexed by word.index() - 1
  String[] expectedLemmas = {"Jennifer", "have", "green", "antenna"};

  for (String semgrex : semgrexPatterns) {
    String lemma = String.join(newline,
                               "<ssurgeon-pattern-list>",
                               "  <ssurgeon-pattern>",
                               "    <uid>38</uid>",
                               "    <notes>Edit a node</notes>",
                               "    <semgrex>" + XMLUtils.escapeXML(semgrex) + "</semgrex>",
                               "    <edit-list>lemmatize -node nolemma</edit-list>",
                               "  </ssurgeon-pattern>",
                               "</ssurgeon-pattern-list>");
    List<SsurgeonPattern> patterns = inst.readFromString(lemma);
    // expected value goes first so a failure message reads correctly
    assertEquals(1, patterns.size());
    SsurgeonPattern lemmatizeSsurgeon = patterns.get(0);

    SemanticGraph sg = SemanticGraph.valueOf("[has/VBZ-2 nsubj> Jennifer/NNP-1 obj> [antennae/NNS-4 dep> green/JJ-3]]");
    // the freshly built graph should have no lemmas before the edit runs
    for (IndexedWord word : sg.vertexSet()) {
      assertNull(word.lemma());
    }

    SemanticGraph newSG = lemmatizeSsurgeon.iterate(sg).first;
    for (IndexedWord word : newSG.vertexSet()) {
      assertEquals(expectedLemmas[word.index() - 1], word.lemma());
    }
  }
}

/*
* Check that a basic edit script works as expected
*/
@Test
Expand Down

0 comments on commit c26b25e

Please sign in to comment.