Skip to content

v0.2.55..v0.2.56 changeset AddressNormalizer.cpp

Garret Voltz edited this page Aug 14, 2020 · 3 revisions
diff --git a/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp b/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp
index d173ae8..2c220c3 100644
--- a/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp
+++ b/hoot-core/src/main/cpp/hoot/core/conflate/address/AddressNormalizer.cpp
@@ -30,7 +30,8 @@
 // hoot
 #include <hoot/core/util/Log.h>
 #include <hoot/core/conflate/address/LibPostalInit.h>
-#include <hoot/core/conflate/address/AddressTagKeys.h>
+#include <hoot/core/conflate/address/Address.h>
+#include <hoot/core/util/StringUtils.h>
 
 // libpostal
 #include <libpostal/libpostal.h>
@@ -41,11 +42,12 @@ namespace hoot
 AddressNormalizer::AddressNormalizer() :
 _numNormalized(0)
 {
+  _addressTagKeys.reset(new AddressTagKeys()); // this normalizer's own instance, not the singleton
 }
 
 void AddressNormalizer::normalizeAddresses(const ElementPtr& e)
 {
-  const QSet<QString> addressTagKeys = AddressTagKeys::getInstance().getAddressTagKeys(*e);
+  const QSet<QString> addressTagKeys = _addressTagKeys->getAddressTagKeys(*e);
   LOG_VART(addressTagKeys);
   for (QSet<QString>::const_iterator addressTagKeyItr = addressTagKeys.begin();
        addressTagKeyItr != addressTagKeys.end(); ++addressTagKeyItr)
@@ -84,13 +86,30 @@ void AddressNormalizer::normalizeAddresses(const ElementPtr& e)
 
 QSet<QString> AddressNormalizer::normalizeAddress(const QString& address) const
 {
+  // Clean up the whitespace once so the intersection check and both normalizers get the same text.
+  const QString cleanedAddress = address.trimmed().simplified();
+
+  // libpostal doesn't cope well with street intersections, so those are routed to the custom
+  // intersection logic; every other address goes straight to libpostal.
+  if (Address::isStreetIntersectionAddress(cleanedAddress))
+  {
+    return _normalizeAddressIntersection(cleanedAddress);
+  }
+
+  return _normalizeAddressWithLibPostal(cleanedAddress);
+}
+
+QSet<QString> AddressNormalizer::_normalizeAddressWithLibPostal(const QString& address) const
+{
+  LOG_TRACE("Normalizing " << address << " with libpostal...");
+
+  QSet<QString> normalizedAddresses;
+  QString addressCopy = address;
+
   // See note about init of this in AddressParser::parseAddresses.
   LibPostalInit::getInstance();
 
-  LOG_VART(address);
-  const QString addressToNormalize = address.trimmed().simplified();
-  LOG_VART(addressToNormalize);
-  QSet<QString> normalizedAddresses;
+  _prepareAddressForLibPostalNormalization(addressCopy);
 
   size_t num_expansions;
   // specifying a language in the options is optional, but could we get better performance if
@@ -98,14 +117,13 @@ QSet<QString> AddressNormalizer::normalizeAddress(const QString& address) const
   // first, of course)?
   char** expansions =
     libpostal_expand_address(
-      addressToNormalize.toUtf8().data(), libpostal_get_default_options(),
-      &num_expansions);
+      addressCopy.toUtf8().data(), libpostal_get_default_options(), &num_expansions);
   // add all the normalizations libpostal finds as possible addresses
   for (size_t i = 0; i < num_expansions; i++)
   {
     const QString normalizedAddress = QString::fromUtf8(expansions[i]);
     LOG_VART(normalizedAddress);
-    if (_isValidNormalizedAddress(addressToNormalize, normalizedAddress) &&
+    if (_isValidNormalizedAddress(addressCopy, normalizedAddress) &&
         !normalizedAddresses.contains(normalizedAddress))
     {
       normalizedAddresses.insert(normalizedAddress);
@@ -122,6 +140,164 @@ QSet<QString> AddressNormalizer::normalizeAddress(const QString& address) const
   return normalizedAddresses;
 }
 
+QSet<QString> AddressNormalizer::_normalizeAddressIntersection(const QString& address) const
+{
+  LOG_TRACE("Normalizing intersection: " << address << "...");
+  // Normalizes a street intersection address; throws if it does not split into exactly two parts.
+  const QMap<QString, QString> streetTypeAbbreviationsToFullTypes =
+    Address::getStreetTypeAbbreviationsToFullTypes();
+  const QStringList addressParts =
+    StringUtils::splitOnAny(address, Address::getIntersectionSplitTokens(), 2);
+  LOG_VART(addressParts.size());
+
+  if (addressParts.size() != 2)
+  {
+    throw IllegalArgumentException(
+      "A non-intersection address was passed into street intersection address normalization.");
+  }
+
+  // Expand a trailing street type abbreviation in each of the two intersection parts to its full
+  // type. NOTE(review): endsWith() is a plain suffix test with no word boundary, so a part such as
+  // "Hill Crest" would also match an "st" abbreviation if the map has one - TODO confirm.
+  QString modifiedAddress;
+  for (int i = 0; i < addressParts.size(); i++)
+  {
+    QString addressPart = addressParts.at(i).trimmed();
+    LOG_VART(addressPart);
+    for (QMap<QString, QString>::const_iterator itr = streetTypeAbbreviationsToFullTypes.begin();
+         itr != streetTypeAbbreviationsToFullTypes.end(); ++itr)
+    {
+      const QString abbrev = itr.key().trimmed();
+      LOG_VART(abbrev);
+      const QString fullType = itr.value().trimmed();
+      LOG_VART(fullType);
+
+      LOG_VART(addressPart.endsWith(abbrev, Qt::CaseInsensitive));
+      if (addressPart.endsWith(abbrev, Qt::CaseInsensitive))
+      {
+        StringUtils::replaceLastIndexOf(addressPart, abbrev, fullType);
+        LOG_VART(addressPart);
+      }
+    }
+    LOG_VART(addressPart);
+    // re-join the parts with a fixed " and ", regardless of which token originally split them
+    modifiedAddress += addressPart.trimmed();
+    if (i == 0)
+    {
+      modifiedAddress += " and ";
+    }
+  }
+  LOG_VART(modifiedAddress);
+
+  // If one of the intersection parts has a street type and the other doesn't we'll copy one street
+  // type over to the other. This isn't foolproof as we could end up giving one of the intersections
+  // an incorrect street type (the actual element's address tags never get modified, though).
+  // However, it does help with address matching and will remain in place unless it's found to be
+  // causing harm in some way.
+  // Re-split the rebuilt address; the assert below expects " and " to yield two parts again.
+  QStringList modifiedAddressParts =
+    StringUtils::splitOnAny(modifiedAddress, Address::getIntersectionSplitTokens(), 2);
+  assert(modifiedAddressParts.size() == 2);
+  LOG_VART(modifiedAddressParts);
+  QString firstIntersectionPart = modifiedAddressParts[0].trimmed();
+  LOG_VART(firstIntersectionPart);
+  QString secondIntersectionPart = modifiedAddressParts[1].trimmed();
+  LOG_VART(secondIntersectionPart);
+
+  QStringList streetFullTypes;
+  QStringList streetPluralTypes;
+  const QStringList streetFullTypesTemp =
+    Address::getStreetFullTypesToTypeAbbreviations().keys();
+  streetFullTypes = streetFullTypesTemp;
+
+  // Sometimes intersections have plural street types (suffixes). We want to be able to handle those
+  // as well, but we don't want them plural in the final normalized address.
+  for (int i = 0; i < streetFullTypesTemp.size(); i++)
+  {
+    const QString pluralType = streetFullTypesTemp.at(i) + "s";
+    if (!streetFullTypes.contains(pluralType))
+    {
+      streetFullTypes.append(pluralType);
+    }
+    streetPluralTypes.append(pluralType);
+  }
+  LOG_VART(streetPluralTypes);
+  // Drop the trailing plural "s" from both the local part copies and the modifiedAddressParts
+  // entries; TODO: may not need all of this here
+  if (StringUtils::endsWithAny(firstIntersectionPart.trimmed(), streetPluralTypes))
+  {
+    firstIntersectionPart.chop(1);
+    LOG_VART(firstIntersectionPart);
+  }
+  if (StringUtils::endsWithAny(secondIntersectionPart.trimmed(), streetPluralTypes))
+  {
+    secondIntersectionPart.chop(1);
+    LOG_VART(secondIntersectionPart);
+  }
+  QStringList modifiedAddressPartsTemp;
+  for (int i = 0; i < modifiedAddressParts.size(); i++)
+  {
+    QString modifiedAddressPart = modifiedAddressParts.at(i);
+    if (StringUtils::endsWithAny(modifiedAddressPart.trimmed(), streetPluralTypes))
+    {
+      modifiedAddressPart.chop(1);
+      LOG_VART(modifiedAddressPart);
+    }
+    modifiedAddressPartsTemp.append(modifiedAddressPart);
+  }
+  modifiedAddressParts = modifiedAddressPartsTemp;
+  QString firstIntersectionEndingStreetType =
+    StringUtils::endsWithAnyAsStr(firstIntersectionPart.trimmed(), streetFullTypes).trimmed();
+  if (firstIntersectionEndingStreetType.endsWith('s'))
+  {
+    firstIntersectionEndingStreetType.chop(1); // TODO confirm: no singular type ends in 's'
+  }
+  LOG_VART(firstIntersectionEndingStreetType);
+  QString secondIntersectionEndingStreetType =
+    StringUtils::endsWithAnyAsStr(secondIntersectionPart.trimmed(), streetFullTypes).trimmed();
+  if (secondIntersectionEndingStreetType.endsWith('s'))
+  {
+    secondIntersectionEndingStreetType.chop(1); // TODO confirm: no singular type ends in 's'
+  }
+  LOG_VART(secondIntersectionEndingStreetType);
+  // Copy the street type across only when exactly one of the two parts ended with one.
+  if (!firstIntersectionEndingStreetType.isEmpty() &&
+      secondIntersectionEndingStreetType.isEmpty())
+  {
+    LOG_VART(modifiedAddressParts[1]);
+    modifiedAddressParts[1] =
+      modifiedAddressParts[1].trimmed() + " " + firstIntersectionEndingStreetType.trimmed();
+    LOG_VART(modifiedAddressParts[1]);
+  }
+  else if (firstIntersectionEndingStreetType.isEmpty() &&
+           !secondIntersectionEndingStreetType.isEmpty())
+  {
+    LOG_VART(modifiedAddressParts[0]);
+    modifiedAddressParts[0] =
+      modifiedAddressParts[0].trimmed() + " " + secondIntersectionEndingStreetType.trimmed();
+    LOG_VART(modifiedAddressParts[0]);
+  }
+  modifiedAddress = modifiedAddressParts[0].trimmed() + " and " + modifiedAddressParts[1].trimmed();
+  LOG_VART(modifiedAddress);
+
+  // go ahead and send it to libpostal to finish out the normalization to avoid duplicating some
+  // code, although it probably won't change any from this point
+  return _normalizeAddressWithLibPostal(modifiedAddress);
+}
+
+// Rewrites a trailing standalone "St" token to "Street" in place before the address is handed to
+// libpostal, which would otherwise expand it to "Saint". Street intersection addresses are left
+// untouched.
+//
+// @param address the address to fix up; modified in place
+void AddressNormalizer::_prepareAddressForLibPostalNormalization(QString& address)
+{
+  LOG_TRACE("Before normalization fix: " << address);
+  LOG_VART(Address::isStreetIntersectionAddress(address));
+
+  // Only a whole final token counts as the abbreviation. A bare suffix test would also rewrite
+  // words that merely end in "st", e.g. "West" -> "WeStreet" or "21st" -> "21Street".
+  const QString abbrev = "st";
+  const bool endsWithStToken =
+    address.compare(abbrev, Qt::CaseInsensitive) == 0 ||
+    address.endsWith(" " + abbrev, Qt::CaseInsensitive);
+  if (endsWithStToken && !Address::isStreetIntersectionAddress(address))
+  {
+    // Replace exactly the matched suffix, so the result doesn't depend on the letter case of the
+    // input or on other "st" occurrences earlier in the string.
+    address.chop(abbrev.size());
+    address.append("Street");
+  }
+  LOG_TRACE("After normalization fix: " << address);
+}
+
 bool AddressNormalizer::_isValidNormalizedAddress(const QString& inputAddress,
                                                   const QString& normalizedAddress)
 {
@@ -131,7 +307,8 @@ bool AddressNormalizer::_isValidNormalizedAddress(const QString& inputAddress,
     return false;
   }
   // This is a bit of hack, but I don't like the way libpostal is turning "St" or "Street" into
-  // "Saint".  Should probably look into configuration of libpostal for a possible fix instead.
+  // "Saint". Should probably look into configuration of libpostal or update it for a possible fix
+  // instead.
   else if (normalizedAddress.endsWith("saint", Qt::CaseInsensitive) &&
            (inputAddress.endsWith("street", Qt::CaseInsensitive) ||
             inputAddress.endsWith("st", Qt::CaseInsensitive)))
Clone this wiki locally