From 5e57eaba40897ee93b69ed3f11bda511f6b427d8 Mon Sep 17 00:00:00 2001 From: John Bauer Date: Wed, 20 Mar 2024 08:47:51 -0700 Subject: [PATCH] Adding PTB Corrector as an option reduces the total validation errors in the PTB conversion to dependencies by about 250. Weirdly this is by removing 280 syntax errors and adding 40 morpho errors for aux verbs. Presumably those should be fixable. Of course, there is always more that can be done - there are now 2622 errors left when using the converter. https://github.com/UniversalDependencies/docs/issues/717 --- .../nlp/trees/ud/UniversalDependenciesConverter.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java index 5e4eb5ad7b..337d8b5cf2 100644 --- a/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java +++ b/src/edu/stanford/nlp/trees/ud/UniversalDependenciesConverter.java @@ -6,6 +6,7 @@ import edu.stanford.nlp.semgraph.SemanticGraph; import edu.stanford.nlp.semgraph.SemanticGraphFactory; import edu.stanford.nlp.trees.*; +import edu.stanford.nlp.trees.treebank.EnglishPTBTreebankCorrector; import edu.stanford.nlp.util.Pair; import edu.stanford.nlp.util.PropertiesUtils; import edu.stanford.nlp.util.StringUtils; @@ -229,6 +230,7 @@ private static void addSpaceAfter(SemanticGraph sg, String text, int graphIdx) { * {@code -textFile}: A file with text to be used as a guide for SpaceAfter (optional)
* {@code -outputRepresentation}: "basic" (default), "enhanced", or "enhanced++"
* {@code -combineMWTs}: "False" (default), "True" marks things like it's as MWT + * {@code -correctPTB}: "False" (default), "True" runs the PTB Corrector over the trees */ public static void main(String[] args) { Properties props = StringUtils.argsToProperties(args); @@ -239,6 +241,7 @@ public static void main(String[] args) { boolean addFeatures = PropertiesUtils.getBool(props, "addFeatures", false); boolean combineMWTs = PropertiesUtils.getBool(props, "combineMWTs", false); boolean replaceLemmata = PropertiesUtils.getBool(props, "replaceLemmata", false); + boolean correctPTB = PropertiesUtils.getBool(props, "correctPTB", false); Iterator<Pair<SemanticGraph, SemanticGraph>> sgIterator; // = null; @@ -258,7 +261,7 @@ public static void main(String[] args) { System.err.println("No input file specified!"); System.err.println(); System.err.printf("Usage: java %s [-treeFile trees.tree | -conlluFile deptrees.conllu]" + - " [-addFeatures] [-replaceLemmata] [-textFile trees.txt] [-outputRepresentation basic|enhanced|enhanced++ (default: basic)]%n", + " [-addFeatures] [-replaceLemmata] [-correctPTB] [-textFile trees.txt] [-outputRepresentation basic|enhanced|enhanced++ (default: basic)]%n", UniversalDependenciesConverter.class.getCanonicalName()); return; } @@ -271,6 +274,7 @@ public static void main(String[] args) { UniversalDependenciesFeatureAnnotator featureAnnotator = (addFeatures) ? new UniversalDependenciesFeatureAnnotator() : null; EnglishMWTCombiner mwtCombiner = (combineMWTs) ? new EnglishMWTCombiner() : null; + EnglishPTBTreebankCorrector ptbCorrector = (correctPTB) ? new EnglishPTBTreebankCorrector() : null; CoNLLUDocumentWriter writer = new CoNLLUDocumentWriter(); @@ -282,6 +286,9 @@ public static void main(String[] args) { if (treeFileName != null) { //add UPOS tags Tree tree = ((TreeToSemanticGraphIterator) sgIterator).getCurrentTree(); + if (ptbCorrector != null) { + tree = ptbCorrector.transformTree(tree); + } Tree uposTree = UniversalPOSMapper.mapTree(tree); List