Skip to content

Commit

Permalink
add preTokenized option
Browse files Browse the repository at this point in the history
  • Loading branch information
J38 committed Jan 11, 2022
1 parent 7a3bcc1 commit 647339a
Showing 1 changed file with 21 additions and 0 deletions.
21 changes: 21 additions & 0 deletions src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
Expand Up @@ -230,6 +230,27 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
}
this.properties = props; // from now on we use this.properties

// Alter the annotator list if the preTokenized option is set.
// preTokenized means: split the input text on whitespace only, one sentence per line.
if (PropertiesUtils.getBool(this.properties, "preTokenized")) {
  this.properties.setProperty("tokenize.whitespace", "true");
  this.properties.setProperty("ssplit.eolonly", "true");

  // getProperty returns null when "annotators" is unset, so check BEFORE normalizing.
  // Previously the whitespace stripping was applied first, which threw a
  // NullPointerException and made the later null checks unreachable.
  String rawAnnotators = this.properties.getProperty("annotators");
  if (rawAnnotators == null) {
    // Nothing to rewrite; also avoids setProperty(key, null), which would throw.
    logger.warn("preTokenized option set: No annotators property found, annotators list left unchanged.");
  } else {
    // Prefixes that get collapsed to plain "tokenize,ssplit".
    final String basePrefix = "tokenize,ssplit";
    final String cdcPrefix = "cdc_tokenize";
    final String mwtPrefix = "tokenize,ssplit,mwt";

    // Strip all whitespace so prefix matching is not defeated by "tokenize, ssplit".
    String oldAnnotators = rawAnnotators.replaceAll("\\s+", "");
    String newAnnotators = oldAnnotators;
    if (oldAnnotators.startsWith(cdcPrefix)) {
      newAnnotators = basePrefix + oldAnnotators.substring(cdcPrefix.length());
      logger.info("preTokenized option set: Changing annotators cdc_tokenize to tokenize,ssplit");
    } else if (oldAnnotators.startsWith(mwtPrefix)) {
      // Must be tested before the plain "tokenize,ssplit" prefix, which it also matches.
      newAnnotators = basePrefix + oldAnnotators.substring(mwtPrefix.length());
      logger.info("preTokenized option set: Changing annotators tokenize,ssplit,mwt to tokenize,ssplit");
    } else if (oldAnnotators.startsWith(basePrefix)) {
      logger.info("preTokenized option set: Annotators list starts with tokenize,ssplit, no change needed.");
    } else {
      logger.warn("preTokenized option set: Non-standard annotators list, preTokenized may not work in this case.");
    }
    // Written back even when unchanged so the stored value is the whitespace-free form.
    this.properties.setProperty("annotators", newAnnotators);
  }
}

// cdm [2017]: the pool is set up only after the properties are in place, because
// constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties.
if (annotatorPool != null) {
  // Caller supplied a pool: use it as-is.
  this.pool = annotatorPool;
} else {
  // No pool supplied: build one from the properties and the annotator implementations.
  this.pool = constructAnnotatorPool(props, getAnnotatorImplementations());
}

Expand Down

0 comments on commit 647339a

Please sign in to comment.