Skip to content

v0.2.55..v0.2.56 changeset AddressScoreExtractor.cpp

Garret Voltz edited this page Aug 14, 2020 · 3 revisions
diff --git a/hoot-core/src/main/cpp/hoot/core/algorithms/extractors/AddressScoreExtractor.cpp b/hoot-core/src/main/cpp/hoot/core/algorithms/extractors/AddressScoreExtractor.cpp
index ca13fbc..4563363 100644
--- a/hoot-core/src/main/cpp/hoot/core/algorithms/extractors/AddressScoreExtractor.cpp
+++ b/hoot-core/src/main/cpp/hoot/core/algorithms/extractors/AddressScoreExtractor.cpp
@@ -33,6 +33,8 @@
 #include <hoot/core/util/ConfigOptions.h>
 #include <hoot/core/conflate/address/Address.h>
 #include <hoot/core/util/StringUtils.h>
+#include <hoot/core/algorithms/string/MeanWordSetDistance.h>
+#include <hoot/core/algorithms/string/LevenshteinDistance.h>
 
 using namespace std;
 
@@ -87,6 +89,61 @@ void AddressScoreExtractor::setConfiguration(const Settings& conf)
   }
 }
 
+double AddressScoreExtractor::extract(const OsmMap& map, const ConstElementPtr& element1,
+                                      const ConstElementPtr& element2) const
+{
+  // Returns -1.0 if either element has no address. Otherwise address pairs are compared in
+  // order and the first exact match (1.0) or non-zero partial match score is returned; 0.0 if
+  // none. A past partial matching experiment had no positive effect; see this class's history.
+
+  // see if the first element has any address
+  const QList<Address> element1Addresses = _getElementAddresses(map, element1, element2);
+  LOG_VART(element1Addresses.size());
+  if (element1Addresses.size() == 0)
+  {
+    LOG_TRACE("No element 1 addresses.");
+    return -1.0;
+  }
+
+  // see if the second element has an address
+  const QList<Address> element2Addresses = _getElementAddresses(map, element2, element1);
+  LOG_VART(element2Addresses.size());
+  if (element2Addresses.size() == 0)
+  {
+    LOG_TRACE("No element 2 addresses.");
+    return -1.0;
+  }
+
+  _matchAttemptMade = true; // reached only when both elements have at least one address
+  _addressesProcessed += element2Addresses.size();
+  _addressesProcessed += element1Addresses.size();
+
+  // compare every element 2 address against every element 1 address
+  for (QList<Address>::const_iterator element2AddrItr = element2Addresses.begin();
+       element2AddrItr != element2Addresses.end(); ++element2AddrItr)
+  {
+    Address element2Address = *element2AddrItr;
+    for (QList<Address>::const_iterator element1AddrItr = element1Addresses.begin();
+         element1AddrItr != element1Addresses.end(); ++element1AddrItr)
+    {
+      Address element1Address = *element1AddrItr;
+      if (element2Address == element1Address)
+      {
+        LOG_TRACE("Found address match: 1: " << element1Address << ", 2: " << element2Address);
+        return 1.0;
+      }
+
+      const double partialMatchScore = _getPartialMatchScore(element1Address, element2Address);
+      if (partialMatchScore > 0.0) // NOTE(review): first non-zero score ends the search
+      {
+        return partialMatchScore;
+      }
+    }
+  }
+
+  return 0.0;
+}
+
 QList<Address> AddressScoreExtractor::_getElementAddresses(
   const OsmMap& map, const ConstElementPtr& element,
   const ConstElementPtr& elementBeingComparedWith) const
@@ -94,16 +151,16 @@ QList<Address> AddressScoreExtractor::_getElementAddresses(
   LOG_TRACE("Collecting addresses from: " << element->getElementId() << "...");
 
   if (_cacheEnabled)
-  { 
+  {
     const QList<Address>* cachedVal = _addressesCache[element->getElementId()];
     if (cachedVal != 0)
     {
+      LOG_TRACE("Found cached address(es): " << *cachedVal);
       _addressCacheHits++;
       return *cachedVal;
     }
   }
 
-  //LOG_VART(element);
   QList<Address> elementAddresses = _addressParser.parseAddresses(*element);
   if (elementAddresses.size() == 0)
   {
@@ -114,6 +171,12 @@ QList<Address> AddressScoreExtractor::_getElementAddresses(
       elementAddresses =
         _addressParser.parseAddressesFromWayNodes(
           *way, map, elementBeingComparedWith->getElementId());
+      if (elementAddresses.size() != 0)
+      {
+        LOG_TRACE(
+          "Found " << elementAddresses.size() << " address(es) on the way nodes of " <<
+          element->getElementId());
+      }
     }
     //if still no luck, try to find the address from a poly way node that is a relation member
     else if (element->getElementType() == ElementType::Relation)
@@ -122,8 +185,20 @@ QList<Address> AddressScoreExtractor::_getElementAddresses(
       elementAddresses =
         _addressParser.parseAddressesFromRelationMembers(
           *relation, map, elementBeingComparedWith->getElementId());
+      if (elementAddresses.size() != 0)
+      {
+        LOG_TRACE(
+          "Found " << elementAddresses.size() << " address(es) on the relation members of " <<
+          element->getElementId());
+      }
     }
   }
+  else
+  {
+    LOG_TRACE(
+      "Found " << elementAddresses.size() << " address(es) on " << element->getElementId() <<
+      ": " << elementAddresses);
+  }
 
   if (_cacheEnabled)
   {
@@ -133,52 +208,153 @@ QList<Address> AddressScoreExtractor::_getElementAddresses(
   return elementAddresses;
 }
 
-double AddressScoreExtractor::extract(const OsmMap& map, const ConstElementPtr& element1,
-                                      const ConstElementPtr& element2) const
+bool AddressScoreExtractor::_addressesMatchWithSuffixesRemoved(
+  const Address& address1, const Address& address2) const
 {
-  //Experimented with partial addresses matches in the past and it had no positive affect.  Search
-  //the history for this class to see examples, to see if its worth experimenting with again at
-  //some point.
+  LOG_TRACE("Attempting intersection match or partial street match without suffix...");
 
-  //see if the first element has any address
-  const QList<Address> element1Addresses = _getElementAddresses(map, element1, element2);
-  LOG_VART(element1Addresses.size());
-  if (element1Addresses.size() == 0)
+  Address elementAddress1Temp = address1; // work on copies; the inputs are const references
+  Address elementAddress2Temp = address2;
+  elementAddress1Temp.removeStreetTypes();
+  elementAddress2Temp.removeStreetTypes();
+  LOG_VART(elementAddress1Temp.getAddressStr());
+  LOG_VART(elementAddress2Temp.getAddressStr());
+
+  return elementAddress2Temp == elementAddress1Temp; // compares the stripped copies
+}
+
+bool AddressScoreExtractor::_intersectionAndStreetAddressesMatchWithHouseNumbersRemoved(
+  const Address& address1, const Address& address2) const
+{
+  LOG_TRACE("Attempting street/intersection partial match without house number...");
+
+  const bool element1IsIntersection = Address::isStreetIntersectionAddress(address1);
+  const bool element2IsIntersection = Address::isStreetIntersectionAddress(address2);
+  Address elementAddress1Temp = address1; // work on copies; the inputs are const references
+  Address elementAddress2Temp = address2;
+
+  QStringList intersectionParts; // parts of the intersection address; address 1 is checked first
+  if (element1IsIntersection)
   {
-    LOG_TRACE("No element 1 addresses.");
-    return -1.0;
+    intersectionParts = elementAddress1Temp.getIntersectionParts();
   }
-
-  //see if the second element has an address
-  const QList<Address> element2Addresses = _getElementAddresses(map, element2, element1);
-  LOG_VART(element2Addresses.size());
-  if (element2Addresses.size() == 0)
+  else if (element2IsIntersection)
   {
-    LOG_TRACE("No element 2 addresses.");
-    return -1.0;
+    intersectionParts = elementAddress2Temp.getIntersectionParts();
+  }
+  LOG_VART(intersectionParts);
+  QString nonIntersection; // the other address as a string, with its house number removed
+  if (element1IsIntersection)
+  {
+    elementAddress2Temp.removeHouseNumber();
+    nonIntersection = elementAddress2Temp.getAddressStr();
   }
+  else // NOTE(review): also taken when neither is an intersection; the loop below is then empty
+  {
+    elementAddress1Temp.removeHouseNumber();
+    nonIntersection = elementAddress1Temp.getAddressStr();
+  }
+  LOG_VART(nonIntersection);
+  for (int i = 0; i < intersectionParts.size(); i++)
+  {
+    LOG_VART(intersectionParts.at(i).trimmed());
+    if (nonIntersection == intersectionParts.at(i).trimmed()) // equals one intersection part
+    {
+      return true;
+    }
+  }
+  return false;
+}
 
-  _matchAttemptMade = true;
-  _addressesProcessed += element2Addresses.size();
-  _addressesProcessed += element1Addresses.size();
+bool AddressScoreExtractor::_addressesMatchWithNameComparisonRelaxed(
+  const Address& address1, const Address& address2) const
+{
+   LOG_TRACE("Attempting street partial match with looser street name comparison...");
 
-  //check for address matches
-  for (QList<Address>::const_iterator element2AddrItr = element2Addresses.begin();
-       element2AddrItr != element2Addresses.end(); ++element2AddrItr)
+   Address elementAddress1Temp = address1; // work on copies; the inputs are const references
+   Address elementAddress2Temp = address2;
+   elementAddress1Temp.removeStreetTypes();
+   elementAddress2Temp.removeStreetTypes();
+
+   elementAddress1Temp.removeHouseNumber();
+   elementAddress2Temp.removeHouseNumber();
+   LOG_VART(elementAddress1Temp.getAddressStr());
+   LOG_VART(elementAddress2Temp.getAddressStr());
+
+   MeanWordSetDistance stringComp( // word set comparison wrapping a Levenshtein distance
+     StringDistancePtr(
+       new LevenshteinDistance(ConfigOptions().getLevenshteinDistanceAlpha())));
+   const double stringSim =
+     stringComp.compare(elementAddress1Temp.getAddressStr(), elementAddress2Temp.getAddressStr());
+   LOG_VART(stringSim);
+   return stringSim >= 0.8; // hard-coded similarity threshold; TODO: tie this to a config var?
+}
+
+double AddressScoreExtractor::_getPartialMatchScore(const Address& address1,
+                                                    const Address& address2) const
+{
+  const bool element1IsIntersection = Address::isStreetIntersectionAddress(address1);
+  const bool element2IsIntersection = Address::isStreetIntersectionAddress(address2);
+  const bool onlyOneIsIntersection =
+    (element1IsIntersection && !element2IsIntersection) ||
+    (!element1IsIntersection && element2IsIntersection);
+  LOG_VART(onlyOneIsIntersection);
+
+  // These partial matches (except for the first one) are getting fairly arbitrary scores, which
+  // could be tweaked going forward. Currently, address partial match scoring is primarily being
+  // used by POI/Polygon conflation to prevent removing reviews for features that have addresses
+  // with some similarity.
+
+  // remove the street types (suffixes) from each and see if we have an address string match
+  if (_addressesMatchWithSuffixesRemoved(address1, address2))
   {
-    const Address element2Address = *element2AddrItr;
-    for (QList<Address>::const_iterator element1AddrItr = element1Addresses.begin();
-         element1AddrItr != element1Addresses.end(); ++element1AddrItr)
+    // The addresses match once street types are dropped. If both are intersections and both
+    // report being parsed from an address tag (e.g. one has street types in its intersection
+    // parts and the other doesn't), score it as a full match.
+    if (address1.getParsedFromAddressTag() &&
+        address2.getParsedFromAddressTag() &&
+        element1IsIntersection && element2IsIntersection)
     {
-      const Address element1Address = *element1AddrItr;
-      if (element2Address == element1Address)
-      {
-        LOG_TRACE("Found address match.");
-        return 1.0;
-      }
+      LOG_TRACE(
+        "Found address intersection match after removing suffixes. 1: " <<
+        address1 << ", 2: " << address2);
+      // arguably this could be made into a partial match score like it is for non-intersections
+      return 1.0;
+    }
+    else
+    {
+      LOG_TRACE(
+        "Found partial address match after removing suffixes. 1: " << address1 <<
+         ", 2: " << address2);
+      return 0.8;
     }
   }
 
+  // remove the house numbers from each and see if we have an address string match; only do it if
+  // exactly one of them is an intersection
+  if (onlyOneIsIntersection &&
+      _intersectionAndStreetAddressesMatchWithHouseNumbersRemoved(
+        address1, address2))
+  {
+    LOG_TRACE(
+      "Found partial address intersection/street address match: " << address1 <<
+      ", 2: " << address2);
+    return 0.8;
+  }
+
+  // slight street name misspelling but everything else matches; only do it with basic street
+  // addresses, no intersections, house number ranges or subletters
+  if (!element1IsIntersection && !element2IsIntersection &&
+      !address1.getIsSubLetter() && !address2.getIsSubLetter() &&
+      !address1.getIsRange() && !address2.getIsRange() &&
+      _addressesMatchWithNameComparisonRelaxed(address1, address2))
+  {
+    LOG_TRACE(
+      "Found partial address match based on string similarity. 1: " <<
+      address1 << ", 2: " << address2);
+    return 0.8;
+  }
+
   return 0.0;
 }
 
Clone this wiki locally