Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
The following is a summary of the new features and changes in Baleen 2.2.0. There may be additional changes and features. Please refer to the diff and commit logs for full details.

New core features
* All entities now have a sub-type
* Added gender to Person
* Baleen Jobs framework
* Plankton visual pipeline tool

New collection readers and improvements to existing collection readers
* EmailReader
* FolderReader now accepts a regular expression to filter against, rather than a file extension
* MucReader
* ReutersReader

New annotators and improvements to existing annotators
* Added nautical miles to Distance regex
* CorefBrackets cleaner (replaces CorefLocationCoordinate cleaner)
* Coreference annotators and sieves
* Improvements to LatLon annotator
* Interaction annotators
* Keyword extraction annotators (RakeKeywords and CommonKeywords)
* Relationship annotators
* NPVNP
* SimpleInteraction
* UbmreConstituent
* UbmreDependency
* Rewrite of MoneyRegex to fix issues with previous version
* USTelephone

New consumers and improvements to existing consumers
* CSV Consumers
* Elasticsearch upgraded to Elasticsearch 2
* ElasticsearchRest
* MongoPatternSaver
* Print consumers to output information to the console

New jobs
* Interactions jobs
* MongoStats

New resources
* SharedStopwordResource
* SharedWordNetResource

Bug fixes, improved unit testing, updated dependencies and reductions to technical debt.

Please be aware that some aspects of this release may not be backwards compatible with previous versions.
- Loading branch information
1 parent
19e1689
commit af9abb2
Showing
462 changed files
with
29,913 additions
and
4,616 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
73 changes: 73 additions & 0 deletions
73
...een-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/AddTitleToPerson.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
package uk.gov.dstl.baleen.annotators.cleaners; | ||
|
||
import java.util.ArrayList;
import java.util.Collection;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;

import uk.gov.dstl.baleen.annotators.grammatical.NPTitleEntity;
import uk.gov.dstl.baleen.types.common.Person;
import uk.gov.dstl.baleen.uima.BaleenAnnotator;
|
||
/** | ||
* Add title (mr, president, etc) information to previously found people. | ||
* <p> | ||
* Often with NLP models we find a person, e.g. John Smith but omit the title information, e.g. | ||
* General John Smith, General Sir John Smith. This annotator adds that information back onto the | ||
* entity, thus improving the quality of person extraction and reducing the number of unannotated | ||
* words in a document. | ||
* | ||
* @baleen.javadoc | ||
*/ | ||
public class AddTitleToPerson extends BaleenAnnotator { | ||
|
||
@Override | ||
protected void doProcess(JCas jCas) throws AnalysisEngineProcessException { | ||
// We copy this array as we'll modify people as we go | ||
Collection<Person> people = JCasUtil.select(jCas, Person.class); | ||
|
||
for (Person p : people) { | ||
while(makeReplacement(jCas, p)){ | ||
//Make as many replacements as possible, to capture things like Sir Major General Smith. | ||
} | ||
} | ||
} | ||
|
||
private boolean makeReplacement(JCas jCas, Person p){ | ||
boolean replacementMade = false; | ||
|
||
for(String title : NPTitleEntity.TITLES){ | ||
if(p.getBegin() - title.length() - 1 < 0) | ||
continue; | ||
|
||
String precedingText = jCas.getDocumentText().substring(p.getBegin() - title.length() - 1, p.getBegin() - 1); | ||
if(title.equalsIgnoreCase(precedingText)){ | ||
p.setBegin(p.getBegin() - title.length() - 1); | ||
p.setTitle(extendTitle(precedingText, p.getTitle())); | ||
|
||
replacementMade = true; | ||
} | ||
} | ||
|
||
return replacementMade; | ||
} | ||
|
||
/** | ||
* Add the prefix to the existing title. | ||
* | ||
* @param prefix | ||
* the prefix | ||
* @param title | ||
* the title | ||
* @return the string | ||
*/ | ||
private String extendTitle(String prefix, String title) { | ||
if (title == null || title.isEmpty()) { | ||
return prefix; | ||
} else { | ||
return prefix + " " + title; | ||
} | ||
} | ||
|
||
} |
133 changes: 133 additions & 0 deletions
133
...-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/NaiveMergeRelations.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
package uk.gov.dstl.baleen.annotators.cleaners; | ||
|
||
import java.util.ArrayList; | ||
import java.util.HashSet; | ||
import java.util.List; | ||
import java.util.Set; | ||
import java.util.stream.Collectors; | ||
|
||
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; | ||
import org.apache.uima.fit.descriptor.ConfigurationParameter; | ||
import org.apache.uima.fit.util.JCasUtil; | ||
import org.apache.uima.jcas.JCas; | ||
|
||
import uk.gov.dstl.baleen.types.semantic.Entity; | ||
import uk.gov.dstl.baleen.types.semantic.Relation; | ||
import uk.gov.dstl.baleen.uima.BaleenAnnotator; | ||
|
||
/** | ||
* Removes multiple copies of the same relation within a document. | ||
* | ||
* This is a naive and simple approach which can hide many issues - it is effectively performing | ||
* relationship coreference and deduplication based solely at a relationship level. The algorithm | ||
* works by looking is the relationship types are the same, and if the entities are the same (here | ||
* as well is difficult, this is based on entities having the same type and value which may be | ||
* incorrect for multiple John Smiths). | ||
* | ||
* This only really useful if you want to ensure that from a single document you get only a single | ||
* relationship of the same type, subtype between the same two entities because you want to | ||
* (naively) push data into database and not have to consider this in future algorithms (focusing on | ||
* counting the same relations appearing in different documents). | ||
* | ||
*/ | ||
public class NaiveMergeRelations extends BaleenAnnotator { | ||
|
||
/** | ||
* Symmetric relations (x ~ y and y ~ x are considered the same) if true | ||
* | ||
* @baleen.config true | ||
*/ | ||
public static final String KEY_SYMMETRIC = "symmetric"; | ||
@ConfigurationParameter(name = KEY_SYMMETRIC, defaultValue = "true") | ||
private Boolean symmetric; | ||
|
||
@Override | ||
protected void doProcess(final JCas jCas) throws AnalysisEngineProcessException { | ||
final List<Relation> relations = new ArrayList<>(JCasUtil.select(jCas, Relation.class)); | ||
|
||
final Set<Relation> toRemove = new HashSet<>(); | ||
|
||
for (int i = 0; i < relations.size(); i++) { | ||
final Relation a = relations.get(i); | ||
|
||
if (!toRemove.contains(a)) { | ||
toRemove.addAll(findSameRelations(a, relations.subList(i + 1, relations.size()))); | ||
} | ||
} | ||
|
||
removeFromJCasIndex(toRemove); | ||
} | ||
|
||
/** | ||
* Finds any relations from the list <em>relations</em> that is the same as <em>a</em> | ||
*/ | ||
private List<Relation> findSameRelations(Relation a, List<Relation> relations){ | ||
return relations.stream().filter(b -> isSame(a, b)).collect(Collectors.toList()); | ||
} | ||
|
||
/** | ||
* Checks if relations are the same. | ||
* | ||
* @param a | ||
* the first relation | ||
* @param b | ||
* the second relation | ||
* @return true, if is same | ||
*/ | ||
private boolean isSame(final Relation a, final Relation b) { | ||
boolean sameSourceTarget = false; | ||
if(isSame(a.getSource(), b.getSource()) && isSame(a.getTarget(), b.getTarget())){ | ||
sameSourceTarget = true; | ||
}else if(symmetric && isSame(a.getSource(), b.getTarget()) && isSame(a.getTarget(), b.getSource())){ | ||
//Symmetric, so source and target could be switched | ||
sameSourceTarget = true; | ||
} | ||
|
||
return sameSourceTarget | ||
&& isSame(a.getRelationshipType(), b.getRelationshipType()) | ||
&& isSame(a.getRelationSubType(), b.getRelationSubType()); | ||
} | ||
|
||
/** | ||
* Checks if entity is the same | ||
* | ||
* @param a | ||
* the first entity | ||
* @param b | ||
* the second entity | ||
* @return true, if is same | ||
*/ | ||
private boolean isSame(final Entity a, final Entity b) { | ||
if (a == null && b == null) { | ||
return true; | ||
} | ||
|
||
if (a == null || b == null) { | ||
// implies b != null (as a != b) | ||
return false; | ||
} | ||
|
||
// TODO: is the value test enough? | ||
return a.getType().equals(b.getType()) && isSame(a.getValue(), b.getValue()); | ||
} | ||
|
||
/** | ||
* Checks if two strings are the same. | ||
* | ||
* @param a | ||
* first string | ||
* @param b | ||
* second string | ||
* @return true, if is same | ||
*/ | ||
private boolean isSame(final String a, final String b) { | ||
if (a == null && b == null) { | ||
return true; | ||
} else if (a == null || b == null) { | ||
return false; | ||
} else { | ||
return a.equalsIgnoreCase(b); | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.