diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
index bff6a49d03..768d4b3543 100644
--- a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
+++ b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.java
@@ -230,6 +230,35 @@ public StanfordCoreNLP(Properties props, boolean enforceRequirements, AnnotatorP
     }
     this.properties = props; // from now on we use this.properties
 
+    // alter annotator list if preTokenized option is set
+    // preTokenized means just split input text on white space and one sentence per line
+    if (PropertiesUtils.getBool(this.properties, "preTokenized")) {
+      this.properties.setProperty("tokenize.whitespace", "true");
+      this.properties.setProperty("ssplit.eolonly", "true");
+      String rawAnnotators = this.properties.getProperty("annotators");
+      if (rawAnnotators == null) {
+        // getProperty returns null for a missing key and setProperty rejects null values,
+        // so there is no annotator list to rewrite; warn instead of throwing
+        logger.warn("preTokenized option set: No annotators property found, annotators list left unchanged.");
+      } else {
+        // strip all whitespace so the prefix checks below compare against a compact comma-separated list
+        String oldAnnotators = rawAnnotators.replaceAll("\\s+", "");
+        String newAnnotators = oldAnnotators;
+        if (oldAnnotators.startsWith("cdc_tokenize")) {
+          newAnnotators = "tokenize,ssplit" + oldAnnotators.substring("cdc_tokenize".length());
+          logger.info("preTokenized option set: Changing annotators cdc_tokenize to tokenize,ssplit");
+        } else if (oldAnnotators.startsWith("tokenize,ssplit,mwt")) {
+          newAnnotators = "tokenize,ssplit" + oldAnnotators.substring("tokenize,ssplit,mwt".length());
+          logger.info("preTokenized option set: Changing annotators tokenize,ssplit,mwt to tokenize,ssplit");
+        } else if (oldAnnotators.startsWith("tokenize,ssplit")) {
+          logger.info("preTokenized option set: Annotators list starts with tokenize,ssplit, no change needed.");
+        } else {
+          logger.warn("preTokenized option set: Non-standard annotators list, preTokenized may not work in this case.");
+        }
+        this.properties.setProperty("annotators", newAnnotators);
+      }
+    }
+
     // cdm [2017]: constructAnnotatorPool (PropertiesUtils.getSignature) requires non-null Properties, so after properties setup
     this.pool = annotatorPool != null ? annotatorPool : constructAnnotatorPool(props, getAnnotatorImplementations());