Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enable keeping matched expressions. #1308

Open
wants to merge 7 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion data/edu/stanford/nlp/process/ptblexer.gold
Original file line number Diff line number Diff line change
Expand Up @@ -407,7 +407,8 @@ by
%
.
<p>
C'mon
C'm
on
Fred
,
you
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ public class CoreMapExpressionExtractor<T extends MatchedExpression> {
private boolean keepTags = false;
/* Collapses extraction rules - use with care */
private final boolean collapseExtractionRules;
private final boolean keepNestedMatches;
private final boolean keepOverlappingMatches;
private final Class<CoreAnnotation<List<? extends CoreMap>>> tokensAnnotationKey;
private final Map<Integer, Stage<T>> stages;

Expand Down Expand Up @@ -136,11 +138,16 @@ public CoreMapExpressionExtractor(Env env) {
this.tokensAnnotationKey = EnvLookup.getDefaultTokensAnnotationKey(env);
if (env != null) {
this.collapseExtractionRules = Objects.equals((Boolean) env.get("collapseExtractionRules"), true);
this.keepNestedMatches = Objects.equals((Boolean) env.get("keepNestedMatches"), true);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The truth is, I know almost nothing about tokensregex, and I would say the people still at Stanford are generally in the same boat. Why is it necessary to use Objects.equals here? I see that it was done for a different Boolean earlier as well

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

also, should the default be false instead? that way the behavior doesn't change for existing use cases

this.keepOverlappingMatches = Objects.equals((Boolean) env.get("keepOverlappingMatches"), true);

if (env.get("verbose") != null)
verbose = (env.get("verbose") != null) &&
Objects.equals((Boolean) env.get("verbose"), true);
} else {
this.collapseExtractionRules = false;
this.keepNestedMatches = false;
this.keepOverlappingMatches = false;
}
}

Expand Down Expand Up @@ -433,13 +440,13 @@ private Pair<List<? extends CoreMap>, List<T>> applyCompositeRule(
annotateExpressions(merged, newExprs);
newExprs = MatchedExpression.removeNullValues(newExprs);
if ( ! newExprs.isEmpty()) {
newExprs = MatchedExpression.removeNested(newExprs);
newExprs = MatchedExpression.removeOverlapping(newExprs);
newExprs = this.keepNestedMatches ? newExprs : MatchedExpression.removeNested(newExprs);
newExprs = this.keepOverlappingMatches ? newExprs : MatchedExpression.removeOverlapping(newExprs);
merged = MatchedExpression.replaceMerged(merged, newExprs);
// Favor newly matched expressions over older ones
newExprs.addAll(matchedExpressions);
matchedExpressions = MatchedExpression.removeNested(newExprs);
matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
matchedExpressions = this.keepNestedMatches ? newExprs : MatchedExpression.removeNested(newExprs);
matchedExpressions = this.keepOverlappingMatches ? matchedExpressions : MatchedExpression.removeOverlapping(matchedExpressions);
} else {
extracted = false;
}
Expand Down Expand Up @@ -486,8 +493,8 @@ public List<T> extractExpressions(CoreMap annotation) {
}
annotateExpressions(annotation, matchedExpressions);
matchedExpressions = MatchedExpression.removeNullValues(matchedExpressions);
matchedExpressions = MatchedExpression.removeNested(matchedExpressions);
matchedExpressions = MatchedExpression.removeOverlapping(matchedExpressions);
matchedExpressions = this.keepNestedMatches ? matchedExpressions : MatchedExpression.removeNested(matchedExpressions);
matchedExpressions = this.keepOverlappingMatches ? matchedExpressions : MatchedExpression.removeOverlapping(matchedExpressions);
}

List<? extends CoreMap> merged = MatchedExpression.replaceMergedUsingTokenOffsets(annotation.get(tokensAnnotationKey), matchedExpressions);
Expand Down
4 changes: 4 additions & 0 deletions src/edu/stanford/nlp/ling/tokensregex/MatchedExpression.java
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,10 @@ public String getText() {
return text;
}

public String getName() {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how is this related?

return this.extractFunc.name;
}

public CoreMap getAnnotation() {
return annotation;
}
Expand Down
10 changes: 7 additions & 3 deletions src/edu/stanford/nlp/parser/metrics/EvaluateTreebank.java
Original file line number Diff line number Diff line change
Expand Up @@ -681,8 +681,10 @@ public double testOnTreebank(List<Pair<ParserQuery, Tree>> testTreebank) {
}

public double testOnTreebank(EvaluationDataset testTreebank) {
log.info("Testing on treebank");
Timing treebankTotalTimer = new Timing();
if (summary || !op.testOptions.quietEvaluation) {
log.info("Testing on treebank");
}
Timing treebankTotalTimer = (summary || !op.testOptions.quietEvaluation) ? new Timing() : null;
TreePrint treePrint = op.testOptions.treePrint(op.tlpParams);
TreebankLangParserParams tlpParams = op.tlpParams;
TreebankLanguagePack tlp = op.langpack();
Expand Down Expand Up @@ -729,7 +731,9 @@ public double testOnTreebank(EvaluationDataset testTreebank) {
}

//Done parsing...print the results of the evaluations
treebankTotalTimer.done("Testing on treebank");
if (treebankTotalTimer != null) {
treebankTotalTimer.done("Testing on treebank");
}
if (op.testOptions.quietEvaluation) {
pwErr = tlpParams.pw(System.err);
}
Expand Down
4 changes: 2 additions & 2 deletions src/edu/stanford/nlp/process/PTBLexer.flex
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ SUBSUPNUM = [\u207A\u207B\u208A\u208B]?([\u2070\u00B9\u00B2\u00B3\u2074-\u2079]+
FRAC = ({DIGIT}{1,4}[- \u00A0])?{DIGIT}{1,4}(\\?\/|\u2044){DIGIT}{1,3}(,{DIGIT}{3}|{DIGIT})?
FRAC2 = [\u00BC\u00BD\u00BE\u2150-\u215E\u2189]
/* # is here for historical reasons -- old UK ASCII-equivalent used # for pound mark. Bit ugly now. Allow $$$ */
DOLSIGN = ([A-Z]*\$|#|\$\$\$)
DOLSIGN = ([A-Z]*\$|#|\$\$+)
/* Currency: These are cent, pound, currency, yen; CP1252 euro; ECU and many other currency simples including Euro;
armenian dram, afghani, bengali rupee, thai bhat; full-wdith dollar, cent pound, yen, won */
DOLSIGN2 = [\u00A2-\u00A5\u0080\u20A0-\u20BF\u058F\u060B\u09F2\u09F3\u0AF1\u0BF9\u0E3F\u17DB\uFF04\uFFE0\uFFE1\uFFE5\uFFE6]
Expand Down Expand Up @@ -804,7 +804,7 @@ ASSIMILATIONS3 = cannot|'twas|dunno|['’]d['’]ve
/* Assimilations2 leave 2 chars behind after division */
/* "nno" is a remnant after pushing back from dunno in ASSIMILATIONS3 */
/* Include splitting some apostrophe-less negations, but not ones like "wont" that are also words. */
ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre
ASSIMILATIONS2 = {APOS}tis|gonna|gotta|lemme|gimme|wanna|nno|aint|dont|doesnt|didnt|theyre|c{APOS}mon

/* CP1252: dagger, double dagger, per mille, bullet, small tilde, trademark */
CP1252_MISC_SYMBOL = [\u0086\u0087\u0089\u0095\u0098\u0099]
Expand Down