Skip to content

Commit

Permalink
Adding PTB Corrector as an option reduces the total validation errors…
Browse files Browse the repository at this point in the history
… in the PTB conversion to dependencies by about 250. Oddly, this comes from removing 280 syntax errors while adding 40 morphology errors for auxiliary verbs; presumably those are fixable. Of course, there is always more that can be done - 2622 errors remain when using the converter. UniversalDependencies/docs#717
  • Loading branch information
AngledLuffa committed Mar 20, 2024
1 parent 1f11e3f commit 5e57eab
Showing 1 changed file with 8 additions and 1 deletion.
Expand Up @@ -6,6 +6,7 @@
import edu.stanford.nlp.semgraph.SemanticGraph;
import edu.stanford.nlp.semgraph.SemanticGraphFactory;
import edu.stanford.nlp.trees.*;
import edu.stanford.nlp.trees.treebank.EnglishPTBTreebankCorrector;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
Expand Down Expand Up @@ -229,6 +230,7 @@ private static void addSpaceAfter(SemanticGraph sg, String text, int graphIdx) {
* {@code -textFile}: A file with text to be used as a guide for SpaceAfter (optional)<br>
* {@code -outputRepresentation}: "basic" (default), "enhanced", or "enhanced++"<br>
* {@code -combineMWTs}: "False" (default), "True" marks things like "it's" as MWT<br>
* {@code -correctPTB}: "False" (default), "True" runs the PTB Corrector over the trees
*/
public static void main(String[] args) {
Properties props = StringUtils.argsToProperties(args);
Expand All @@ -239,6 +241,7 @@ public static void main(String[] args) {
boolean addFeatures = PropertiesUtils.getBool(props, "addFeatures", false);
boolean combineMWTs = PropertiesUtils.getBool(props, "combineMWTs", false);
boolean replaceLemmata = PropertiesUtils.getBool(props, "replaceLemmata", false);
boolean correctPTB = PropertiesUtils.getBool(props, "correctPTB", false);

Iterator<Pair<SemanticGraph, SemanticGraph>> sgIterator; // = null;

Expand All @@ -258,7 +261,7 @@ public static void main(String[] args) {
System.err.println("No input file specified!");
System.err.println();
System.err.printf("Usage: java %s [-treeFile trees.tree | -conlluFile deptrees.conllu]" +
" [-addFeatures] [-replaceLemmata] [-textFile trees.txt] [-outputRepresentation basic|enhanced|enhanced++ (default: basic)]%n",
" [-addFeatures] [-replaceLemmata] [-correctPTB] [-textFile trees.txt] [-outputRepresentation basic|enhanced|enhanced++ (default: basic)]%n",
UniversalDependenciesConverter.class.getCanonicalName());
return;
}
Expand All @@ -271,6 +274,7 @@ public static void main(String[] args) {

UniversalDependenciesFeatureAnnotator featureAnnotator = (addFeatures) ? new UniversalDependenciesFeatureAnnotator() : null;
EnglishMWTCombiner mwtCombiner = (combineMWTs) ? new EnglishMWTCombiner() : null;
EnglishPTBTreebankCorrector ptbCorrector = (correctPTB) ? new EnglishPTBTreebankCorrector() : null;

CoNLLUDocumentWriter writer = new CoNLLUDocumentWriter();

Expand All @@ -282,6 +286,9 @@ public static void main(String[] args) {
if (treeFileName != null) {
//add UPOS tags
Tree tree = ((TreeToSemanticGraphIterator) sgIterator).getCurrentTree();
if (ptbCorrector != null) {
tree = ptbCorrector.transformTree(tree);
}
Tree uposTree = UniversalPOSMapper.mapTree(tree);
List<Label> uposLabels = uposTree.preTerminalYield();
for (IndexedWord token: sg.vertexListSorted()) {
Expand Down

0 comments on commit 5e57eab

Please sign in to comment.