Merge pull request #25 from dstl/pr/24

Modifications to PR24 (Provide Normalisation Cleaners)
dstl · May 3, 2016 · 19e1689 · 19e1689
2 parents c15565a + c34c108
commit 19e1689
Show file tree

Hide file tree

Showing 20 changed files with 1,705 additions and 86 deletions.
diff --git a/...aleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/NormalizeDates.java b/...aleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/NormalizeDates.java
@@ -0,0 +1,151 @@
+package uk.gov.dstl.baleen.annotators.cleaners;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Date;
+import java.util.Locale;
+
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+
+import uk.gov.dstl.baleen.annotators.cleaners.helpers.AbstractNormalizeEntities;
+import uk.gov.dstl.baleen.types.semantic.Entity;
+import uk.gov.dstl.baleen.types.temporal.DateType;
+
+/**
+ * Formats the value of the DateType entities to a standard form for consistent representation
+ * on export. If the value cannot be parsed, or does not describe a real calendar date, the
+ * original string is returned unchanged.
+ * <p>
+ * Only dates with a four digit year are reformatted. Dates with a two digit year are left alone
+ * because the century is indeterminable; an assumption could be made, but the user with the rest
+ * of the document contents is better placed to make it.
+ * <p>
+ * Numeric dates that could be read as either day/month/year or month/day/year are interpreted in
+ * British (day, month, year) order and are reported through {@link #getOrderAmbiguityFlag()}.
+ * 
+ * @baleen.javadoc
+ */
+public class NormalizeDates extends AbstractNormalizeEntities {
+
+	/**
+	 * What is the format that the dates should be normalized to? The default value follows the 
+	 * ISO8601 standard.
+	 * @baleen.config yyyy'-'MM'-'dd
+	 */
+	public static final String PARAM_DATE_FORMAT = "correctFormat";
+	@ConfigurationParameter(name = PARAM_DATE_FORMAT, defaultValue = "yyyy'-'MM'-'dd")
+	String correctFormat;
+
+	/** Regex fragment for a day of the month (1-31, optional leading zero). */
+	private static final String DAY = "(0?[1-9]|[12][0-9]|3[01])";
+
+	/** Regex fragment for a month number (1-12, optional leading zero). */
+	private static final String MONTH = "(0?[1-9]|1[012])";
+
+	/** Regex fragment for a four digit year. */
+	private static final String YEAR = "[0-9]{4}";
+
+	private boolean orderAmbiguityFlag = false;
+
+	/**
+	 * Rewrites the entity value into the configured date format.
+	 * 
+	 * @param e the entity whose value is to be normalized
+	 * @return the reformatted date, or the original value if it is not recognised as a date
+	 */
+	@Override
+	protected String normalize(Entity e) {
+		String dateString = e.getValue();
+
+		/* The flag is reset so that it only ever describes the date that is about to be
+		 * processed. Callers that want the flag should read it immediately after the
+		 * call that normalizes the date.
+		 */
+		orderAmbiguityFlag = false;
+
+		//Normalise delimiters to single spaces and drop day names
+		String cleaned = dateString.replaceAll("[./-]", " ");
+		cleaned = cleaned.replaceAll("(Mon|Tues|Wed|Wednes|Thurs|Fri|Sat|Satur|Sun)(day)? ", "");
+		cleaned = cleaned.replaceAll("[ ]{2,}", " ");
+
+		//Numeric date in possible British order day, month, year
+		if (cleaned.matches(DAY + " " + MONTH + " " + YEAR)) {
+			//Both leading numbers are valid months, so the order cannot be determined
+			if (cleaned.matches(MONTH + " " + MONTH + " " + YEAR)) {
+				orderAmbiguityFlag = true;
+			}
+			return reformat(cleaned, "d' 'M' 'yyyy", dateString);
+		}
+
+		//Numeric date definitely in American order month, day, year
+		if (cleaned.matches(MONTH + " " + DAY + " " + YEAR)) {
+			return reformat(cleaned, "M' 'd' 'yyyy", dateString);
+		}
+
+		//Numeric date in ISO standard order year, month, day
+		if (cleaned.matches(YEAR + " [0-1]?[0-9] [0-9]{1,2}")) {
+			return reformat(cleaned, "yyyy' 'M' 'd", dateString);
+		}
+
+		//Remaining candidates have a textual month. Ordinal suffixes are only removed when
+		//they directly follow a digit so that letters inside words are never touched.
+		cleaned = cleaned.replaceAll("(?i)(?<=[0-9])(st|nd|rd|th)\\b", "");
+
+		//Textual date in British order day, month, year
+		if (cleaned.matches(DAY + " .+ " + YEAR)) {
+			return reformat(cleaned, "d' 'MMM' 'yyyy", dateString);
+		}
+
+		//Textual date in American order month, day, year
+		if (cleaned.matches(".+ " + DAY + " " + YEAR)) {
+			return reformat(cleaned, "MMM' 'd' 'yyyy", dateString);
+		}
+
+		return dateString;
+	}
+
+	/**
+	 * Parses a cleaned date string with the given pattern and renders it in the configured
+	 * output format.
+	 * 
+	 * @param cleaned the cleaned date string to parse
+	 * @param inputPattern the SimpleDateFormat pattern describing the layout of cleaned
+	 * @param fallback the value to return when cleaned cannot be parsed
+	 * @return the date in the configured format, or fallback if parsing fails
+	 */
+	private String reformat(String cleaned, String inputPattern, String fallback) {
+		SimpleDateFormat formatter = new SimpleDateFormat(inputPattern, Locale.ENGLISH);
+		//Strict parsing: without this an impossible date such as 31 02 2016 is silently
+		//rolled over to 2 March 2016 rather than being rejected.
+		formatter.setLenient(false);
+
+		try {
+			Date date = formatter.parse(cleaned);
+			formatter.applyPattern(correctFormat);
+			return formatter.format(date);
+		}
+		catch (ParseException exception) {
+			return fallback;
+		}
+	}
+
+	/**
+	 * Method to get the value of the orderAmbiguityFlag.
+	 * 
+	 * @return orderAmbiguityFlag is a boolean that is true if the most recently parsed date
+	 * is ambiguous as to whether it is ordered Day/Month/Year or Month/Day/Year
+	 */
+	public boolean getOrderAmbiguityFlag() {
+		return orderAmbiguityFlag;
+	}
+
+	/**
+	 * @param e the entity being considered
+	 * @return true only for DateType entities
+	 */
+	@Override
+	protected boolean shouldNormalize(Entity e) {
+		return e instanceof DateType;
+	}
+
+}
diff --git a/...baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/NormalizeOSGB.java b/...baleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/NormalizeOSGB.java
@@ -0,0 +1,28 @@
+package uk.gov.dstl.baleen.annotators.cleaners;
+
+import uk.gov.dstl.baleen.annotators.cleaners.helpers.AbstractNormalizeEntities;
+import uk.gov.dstl.baleen.types.geo.Coordinate;
+import uk.gov.dstl.baleen.types.semantic.Entity;
+
+/**
+ * Formats the value of the OSGB entities to be consistent for export and entity
+ * matching between documents. All whitespace is removed and letters are converted
+ * to upper case, so a well formed reference becomes two upper case letters followed
+ * by its digits with no separators. The number of digits is not validated here.
+ * 
+ * @baleen.javadoc
+ */
+public class NormalizeOSGB extends AbstractNormalizeEntities {
+
+	/**
+	 * @param e the entity whose value is to be normalized
+	 * @return the value with all whitespace removed and letters upper-cased
+	 */
+	@Override
+	protected String normalize(Entity e) {
+		String compact = e.getValue().replaceAll("\\s", "");
+		//Locale.ROOT keeps the result independent of the JVM default locale. The fully
+		//qualified name is used so that the import list does not need to change.
+		return compact.toUpperCase(java.util.Locale.ROOT);
+	}
+
+	/**
+	 * @param e the entity being considered
+	 * @return true only for Coordinate entities whose sub type is "osgb"
+	 */
+	@Override
+	protected boolean shouldNormalize(Entity e) {
+		//Literal-first comparison so that an entity without a sub type is skipped
+		//instead of causing a NullPointerException.
+		return e instanceof Coordinate && "osgb".equals(e.getSubType());
+	}
+}
diff --git a/...aleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/NormalizeTimes.java b/...aleen-annotators/src/main/java/uk/gov/dstl/baleen/annotators/cleaners/NormalizeTimes.java
@@ -0,0 +1,181 @@
+package uk.gov.dstl.baleen.annotators.cleaners;
+
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.Arrays;
+import java.util.Date;
+import java.util.TimeZone;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+
+import org.apache.commons.lang.StringUtils;
+
+import uk.gov.dstl.baleen.annotators.cleaners.helpers.AbstractNormalizeEntities;
+import uk.gov.dstl.baleen.types.semantic.Entity;
+import uk.gov.dstl.baleen.types.temporal.Time;
+
+/**
+ * Edits the value field of Time entities (separate from time entities which are a quantity) to a
+ * consistent format, for ease of entity comparison between documents and consistent data representation
+ * on export. The format is set as the 24 hour representation of the time followed by an upper-case acronym
+ * of the time zone +/- an integer number of hours as an offset. The zone and offset are both optional,
+ * only preserved from the original value, never added or changed to be in a different zone and separated
+ * from the time by a single space. Zones are only recognised if they are an acronym in the set of zone IDs
+ * available from the TimeZone class. Values that match none of the supported layouts are returned unchanged.
+ * 
+ * @baleen.javadoc
+ */
+public class NormalizeTimes extends AbstractNormalizeEntities{
+
+	/* Create a list of time zone acronyms to use in the regular expressions. This is the same list as is used
+	 * by the time regex annotator to find times.
+	 */
+	private static final String TIME_ZONES = Arrays.stream(TimeZone.getAvailableIDs())
+			.filter(s -> StringUtils.isAllUpperCase(s) && s.length() <= 3)
+			.collect(Collectors.joining("|"));
+
+	private static final String TO24HR = "HH:mm";
+
+	/* Locale used for parsing am/pm markers and for upper-casing, so that results do not depend on the
+	 * JVM default locale. The fully qualified name is used so that the import list does not need to change.
+	 */
+	private static final java.util.Locale PARSE_LOCALE = java.util.Locale.ENGLISH;
+
+	//Keywords with a known time. Group 3 (midnight) is the only one that maps to 00:00.
+	private static final Pattern KEYWORDS = Pattern.compile("(midday)|(noon)|(midnight)",Pattern.CASE_INSENSITIVE);
+
+	/* One of the regex patterns used by the time annotator. Captures 12/24 hour time formats with/without
+	 * am/pm marker or time zone. Groups used: 1 = hh:mm, 2 = hours, 5 = zone, 6 = offset, 10 = am/pm.
+	 */
+	private static final Pattern CLOCK_TIME = Pattern.compile("\\b((([0-1]?[0-9])|2[0-4])[:\\.][0-5][0-9])\\h*(("
+			+ TIME_ZONES + ")([ ]?[+-][ ]?((0?[0-9])|(1[0-2])))?)?\\h*(pm|am)?\\b",Pattern.CASE_INSENSITIVE);
+
+	//Another of the regex patterns, this captures simple 12 hour format times with no minute values
+	private static final Pattern SIMPLE_TIME = Pattern.compile("((1[0-2])|([1-9]))(pm|am)",Pattern.CASE_INSENSITIVE);
+
+	//Four digit 24 hour time at the start of the value, e.g. 0930 or 2400, not followed by a further digit
+	private static final Pattern FOUR_DIGIT_PREFIX = Pattern.compile("([01][0-9]|2[0-4])[0-5][0-9](?![0-9])");
+
+	/**
+	 * Rewrites the entity value as a 24 hour time, keeping any zone and offset it already had.
+	 * 
+	 * @param e the entity whose value is to be normalized
+	 * @return the normalized time, or the original value if it cannot be interpreted
+	 */
+	@Override
+	protected String normalize(Entity e) {
+		String timeString = e.getValue();
+
+		Matcher mKeyWords = KEYWORDS.matcher(timeString);
+		if (mKeyWords.find()) {
+			return mKeyWords.group(3) == null ? "12:00" : "00:00";
+		}
+
+		Matcher mClock = CLOCK_TIME.matcher(timeString);
+		if (mClock.matches()) {
+			return normalizeClockTime(timeString, mClock);
+		}
+
+		if (SIMPLE_TIME.matcher(timeString).matches()) {
+			return normalizeSimpleTime(timeString);
+		}
+
+		return normalizeFourDigitTime(timeString);
+	}
+
+	/**
+	 * Handles values of the form hh:mm (or hh.mm) with optional zone, offset and am/pm marker.
+	 * 
+	 * @param timeString the original value, returned if parsing fails
+	 * @param mClock a matcher for CLOCK_TIME that has already matched timeString
+	 * @return the time in 24 hour form followed by the zone and offset when present
+	 */
+	private String normalizeClockTime(String timeString, Matcher mClock) {
+		String zone = mClock.group(5);
+		String meridiem = mClock.group(10);
+
+		/* the prefix '15 ' is used to set the year. This is an arbitrary value, special only in that it
+		 * is not the default year the formatter chooses, 1970. Using this default year, times with the GMT
+		 * zone specified are interpreted with +1 hour e.g. 01:00 GMT AM is interpreted as 02:00 GMT
+		 * AM by the formatter. This may have something to do with the Double Summer Time experiment which
+		 * ran from 1968 - 1971.
+		 */
+		StringBuilder timeSb = new StringBuilder("15 ").append(mClock.group(1).replace('.', ':'));
+		StringBuilder parsePattern = new StringBuilder("yy ");
+
+		if (meridiem != null) {
+			/* Neither "H:m a" or "K:m a" can correctly identify all values. The former will parse
+			 * 03:01 pm as 03:01 am whilst the latter will interpret 17:01 pm as 05:01 am, so the
+			 * pattern is chosen according to whether the hours are below 12.
+			 */
+			parsePattern.append(Integer.parseInt(mClock.group(2)) < 12 ? "K:m a" : "H:m a");
+			timeSb.append(' ').append(meridiem);
+		} else {
+			parsePattern.append("H:m");
+		}
+
+		String outputPattern = TO24HR;
+		if (zone != null) {
+			parsePattern.append(" z");
+			timeSb.append(' ').append(zone);
+			outputPattern = TO24HR + " z";
+		}
+
+		//The same formatter instance is used for parsing and formatting, as in the original
+		//implementation, so the output is rendered by the formatter that read the zone.
+		SimpleDateFormat formatter = new SimpleDateFormat(parsePattern.toString(), PARSE_LOCALE);
+		Date timeObj;
+		try {
+			timeObj = formatter.parse(timeSb.toString());
+		} catch (ParseException exception) {
+			getMonitor().warn("Parse exception occurred at {} for time {}", exception.getErrorOffset(), timeSb.toString(), exception);
+			return timeString;
+		}
+		formatter.applyPattern(outputPattern);
+
+		StringBuilder result = new StringBuilder(formatter.format(timeObj));
+		if (mClock.group(6) != null) {
+			result.append(tidyOffset(mClock.group(6)));
+		}
+		return result.toString();
+	}
+
+	/**
+	 * Handles simple 12 hour values with no minutes such as 9pm.
+	 * 
+	 * @param timeString the value to convert, returned unchanged if parsing fails
+	 * @return the time in 24 hour form
+	 */
+	private String normalizeSimpleTime(String timeString) {
+		SimpleDateFormat formatter = new SimpleDateFormat("hha", PARSE_LOCALE);
+		try {
+			Date timeObj = formatter.parse(timeString);
+			formatter.applyPattern(TO24HR);
+			return formatter.format(timeObj);
+		} catch (ParseException exception) {
+			getMonitor().warn("Parse exception occurred at {} for time {}", exception.getErrorOffset(), timeString, exception);
+			return timeString;
+		}
+	}
+
+	/**
+	 * Handles the remaining 24 hour formats: '2400 time zone (+offset)?' or '2400 h/hours'.
+	 * 
+	 * @param timeString the value to convert
+	 * @return the time in 24 hour form with any zone and offset, or timeString unchanged when it
+	 * does not start with a four digit time
+	 */
+	private String normalizeFourDigitTime(String timeString) {
+		//Without this check short values throw StringIndexOutOfBoundsException and
+		//unrelated text is cut up as though it were a time.
+		if (!FOUR_DIGIT_PREFIX.matcher(timeString).lookingAt()) {
+			return timeString;
+		}
+
+		String hours = timeString.substring(0, 2);
+		if ("24".equals(hours)) {
+			hours = "00";
+		}
+		StringBuilder timeSb = new StringBuilder(hours).append(':').append(timeString.substring(2, 4));
+
+		String suffix = tidyOffset(timeString.substring(4)).toUpperCase(PARSE_LOCALE);
+
+		//An hours marker is not part of the standard output, and an empty suffix must not
+		//leave a trailing space.
+		if (suffix.isEmpty() || suffix.matches("H|HOURS")) {
+			return timeSb.toString();
+		}
+		return timeSb.append(' ').append(suffix).toString();
+	}
+
+	/**
+	 * Removes whitespace and a zero directly after a sign, e.g. "+ 01" becomes "+1", and drops a
+	 * sign that is left with no digits after it, so that "+0" or "-0" does not leave a dangling sign.
+	 * 
+	 * @param raw the zone and/or offset text as it appeared in the value
+	 * @return the tidied text, possibly empty
+	 */
+	private static String tidyOffset(String raw) {
+		String tidied = raw.replaceAll("[\\s]", "");
+		tidied = tidied.replaceAll("\\+0", "+");
+		tidied = tidied.replaceAll("-0", "-");
+		return tidied.replaceAll("[+-]$", "");
+	}
+
+	/**
+	 * @param e the entity being considered
+	 * @return true only for Time entities
+	 */
+	@Override
+	protected boolean shouldNormalize(Entity e) {
+		return (e instanceof Time);
+	}
+}