stanfordnlp · jrsperry · Oct 4, 2022 · Oct 4, 2022 · Oct 4, 2022 · Oct 8, 2022
diff --git a/data/edu/stanford/nlp/process/ptblexer.gold b/data/edu/stanford/nlp/process/ptblexer.gold
@@ -407,7 +407,8 @@ by
 %
 .
 <p>
-C'mon
+C'm
+on
 Fred
 ,
 you

diff --git a/src/edu/stanford/nlp/ling/tokensregex/CoreMapExpressionExtractor.java b/src/edu/stanford/nlp/ling/tokensregex/CoreMapExpressionExtractor.java
@@ -54,6 +54,8 @@ public class CoreMapExpressionExtractor<T extends MatchedExpression>  {
   private boolean keepTags = false;
   /* Collapses extraction rules - use with care */
   private final boolean collapseExtractionRules;
+  private final boolean keepNestedMatches;
+  private final boolean keepOverlappingMatches;
   private final Class<CoreAnnotation<List<? extends CoreMap>>> tokensAnnotationKey;
   private final Map<Integer, Stage<T>> stages;
 
@@ -136,11 +138,16 @@ public CoreMapExpressionExtractor(Env env) {
     this.tokensAnnotationKey = EnvLookup.getDefaultTokensAnnotationKey(env);
     if (env != null) {
       this.collapseExtractionRules = Objects.equals((Boolean) env.get("collapseExtractionRules"), true);
+      this.keepNestedMatches = Objects.equals((Boolean) env.get("keepNestedMatches"), true);
+      this.keepOverlappingMatches = Objects.equals((Boolean) env.get("keepOverlappingMatches"), true);
+
       if (env.get("verbose") != null)
         verbose =  (env.get("verbose") != null) &&
                 Objects.equals((Boolean) env.get("verbose"), true);
     } else {
       this.collapseExtractionRules = false;
+      this.keepNestedMatches = false;
+      this.keepOverlappingMatches = false;
     }
   }
 
@@ -433,13 +440,13 @@ private Pair<List<? extends CoreMap>, List<T>> applyCompositeRule(
         annotateExpressions(merged, newExprs);
         newExprs = MatchedExpression.removeNullValues(newExprs);
         if ( ! newExprs.isEmpty()) {
-          newExprs = MatchedExpression.removeNested(newExprs);
-          newExprs = MatchedExpression.removeOverlapping(newExprs);
+          newExprs = this.keepNestedMatches ? newExprs : MatchedExpression.removeNested(newExprs);
+          newExprs = this.keepOverlappingMatches ? newExprs : MatchedExpression.removeOverlapping(newExprs);
           merged = MatchedExpression.replaceMerged(merged, newExprs);
           // Favor newly matched expressions over older ones
           newExprs.addAll(matchedExpressions);
-          matchedExpressions = MatchedExpression.removeNested(newExprs);
-          matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
+          matchedExpressions = this.keepNestedMatches ? newExprs : MatchedExpression.removeNested(newExprs);
+          matchedExpressions = this.keepOverlappingMatches ? matchedExpressions : MatchedExpression.removeOverlapping(matchedExpressions);
         } else {
           extracted = false;
         }
@@ -486,8 +493,8 @@ public List<T> extractExpressions(CoreMap annotation) {
         }
         annotateExpressions(annotation, matchedExpressions);
         matchedExpressions = MatchedExpression.removeNullValues(matchedExpressions);
-        matchedExpressions = MatchedExpression.removeNested(matchedExpressions);
-        matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
+        matchedExpressions = this.keepNestedMatches ? matchedExpressions : MatchedExpression.removeNested(matchedExpressions);
+        matchedExpressions = this.keepOverlappingMatches ? matchedExpressions : MatchedExpression.removeOverlapping(matchedExpressions);
       }
 
       List<? extends CoreMap> merged = MatchedExpression.replaceMergedUsingTokenOffsets(annotation.get(tokensAnnotationKey), matchedExpressions);

diff --git a/src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java b/src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java
@@ -277,6 +277,10 @@ public String getText() {
     return text;
   }
 
+  public String getName() {
+    return this.extractFunc.name;
+  }
+
   public CoreMap getAnnotation() {
     return annotation;
   }

diff --git a/src/edu/stanford/nlp/parser/metrics/EvaluateTreebank.java b/src/edu/stanford/nlp/parser/metrics/EvaluateTreebank.java
@@ -681,8 +681,10 @@ public double testOnTreebank(List<Pair<ParserQuery, Tree>> testTreebank) {
   }
 
   public double testOnTreebank(EvaluationDataset testTreebank) {
-    log.info("Testing on treebank");
-    Timing treebankTotalTimer = new Timing();
+    if (summary || !op.testOptions.quietEvaluation) {
+      log.info("Testing on treebank");
+    }
+    Timing treebankTotalTimer = (summary || !op.testOptions.quietEvaluation) ? new Timing() : null;
     TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
     TreebankLangParserParams tlpParams = op.tlpParams;
     TreebankLanguagePack tlp = op.langpack();
@@ -729,7 +731,9 @@ public double testOnTreebank(EvaluationDataset testTreebank) {
     }
 
     //Done parsing...print the results of the evaluations
-    treebankTotalTimer.done("Testing on treebank");
+    if (treebankTotalTimer != null) {
+      treebankTotalTimer.done("Testing on treebank");
+    }
     if (op.testOptions.quietEvaluation) {
       pwErr = tlpParams.pw(System.err);
     }

diff --git a/src/edu/stanford/nlp/process/PTBLexer.flex b/src/edu/stanford/nlp/process/PTBLexer.flex
@@ -587,7 +587,7 @@ SUBSUPNUM = [\u207A\u207B\u208A\u208B]?([\u2070\u00B9\u00B2\u00B3\u2074-\u2079]+
 FRAC = ({DIGIT}{1,4}[- \u00A0])?{DIGIT}{1,4}(\\?\/|\u2044){DIGIT}{1,3}(,{DIGIT}{3}|{DIGIT})?
 FRAC2 = [\u00BC\u00BD\u00BE\u2150-\u215E\u2189]
 /* # is here for historical reasons -- old UK ASCII-equivalent used # for pound mark. Bit ugly now. Allow $$$ */
-DOLSIGN = ([A-Z]*\$|#|\$\$\$)
+DOLSIGN = ([A-Z]*\$|#|\$\$+)
 /* Currency: These are cent, pound, currency, yen; CP1252 euro; ECU and many other currency simples including Euro;
    armenian dram, afghani, bengali rupee, thai bhat; full-wdith dollar, cent pound, yen, won */
 DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9\u0E3F\u17DB\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]
@@ -804,7 +804,7 @@ ASSIMILATIONS3 = cannot|'twas|dunno|['’]d['’]ve
 /* Assimilations2 leave 2 chars behind after division */
 /* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
 /* Include splitting some apostrophe-less negations, but not ones like "wont" that are also words. */
-ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre
+ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre|c{APOS}mon
 
 /* CP1252: dagger, double dagger, per mille, bullet, small tilde, trademark */
 CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]