Skip to content

v0.2.54..v0.2.55 changeset ElementDeduplicator.cpp

Garret Voltz edited this page Aug 14, 2020 · 1 revision
diff --git a/hoot-core/src/main/cpp/hoot/core/elements/ElementDeduplicator.cpp b/hoot-core/src/main/cpp/hoot/core/elements/ElementDeduplicator.cpp
new file mode 100644
index 0000000..c317dd4
--- /dev/null
+++ b/hoot-core/src/main/cpp/hoot/core/elements/ElementDeduplicator.cpp
@@ -0,0 +1,397 @@
+/*
+ * This file is part of Hootenanny.
+ *
+ * Hootenanny is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * --------------------------------------------------------------------
+ *
+ * The following copyright notices are generated automatically. If you
+ * have a new notice to add, please use the format:
+ * " * @copyright Copyright ..."
+ * This will properly maintain the copyright information. DigitalGlobe
+ * copyrights will be updated automatically.
+ *
+ * @copyright Copyright (C) 2020 DigitalGlobe (http://www.digitalglobe.com/)
+ */
+
+#include "ElementDeduplicator.h"
+
+// Hoot
+#include <hoot/core/util/HootException.h>
+#include <hoot/core/util/Log.h>
+#include <hoot/core/visitors/ElementHashVisitor.h>
+#include <hoot/core/elements/WayUtils.h>
+#include <hoot/core/ops/RemoveElementByEid.h>
+
+namespace hoot
+{
+
+ElementDeduplicator::ElementDeduplicator() :
+_dedupeIntraMap(true),
+_dedupeNodes(true),
+_dedupeWays(true),
+_dedupeRelations(true),
+_favorMoreConnectedWays(false)
+{
+}
+
+void ElementDeduplicator::dedupe(OsmMapPtr map)
+{
+  _validateInputs();
+
+  LOG_STATUS("De-duping intra-map: " << map->getName() << "...");
+  LOG_DEBUG(map->getName() << " size before de-duping: " << map->size());
+
+  const long nodesBefore = map->getNodeCount();
+  const long waysBefore = map->getWayCount();
+  const long relationsBefore = map->getRelationCount();
+
+  // calculate our unique hashes per element and get a list of duplicate pairs within the map
+  QMap<QString, ElementId> mapHashes;
+  QSet<std::pair<ElementId, ElementId>> duplicates;
+  _calcElementHashes(map, mapHashes, duplicates);
+  QSet<QString> mapHashesSet = mapHashes.keys().toSet();
+  LOG_VARD(mapHashesSet.size());
+
+  // convert the duplicate pairs to the IDs of the element we actually want to remove, sorted by
+  // element type and then remove them
+  const QMap<ElementType::Type, QSet<ElementId>> elementsToRemove = _dupesToElementIds(duplicates);
+  if (_dedupeRelations)
+  {
+    _removeElements(elementsToRemove[ElementType::Relation], map);
+  }
+  if (_dedupeWays)
+  {
+    _removeElements(elementsToRemove[ElementType::Way], map);
+  }
+  if (_dedupeNodes)
+  {
+    _removeElements(elementsToRemove[ElementType::Node], map);
+  }
+
+  LOG_DEBUG(map->getName() << " size after de-duping: " << map->size());
+  LOG_STATUS(
+    "Removed " << nodesBefore - map->getNodeCount() << " duplicate nodes from " <<
+    map->getName());
+  LOG_STATUS(
+    "Removed " << waysBefore - map->getWayCount() << " duplicate ways from " <<
+    map->getName());
+  LOG_STATUS(
+    "Removed " << relationsBefore - map->getRelationCount() << " duplicate relations from " <<
+    map->getName());
+}
+
+void ElementDeduplicator::dedupe(OsmMapPtr map1, OsmMapPtr map2)
+{
+  _validateInputs();
+
+  LOG_STATUS("De-duping map: " << map1->getName() << " and " << map2->getName() << "...");
+  LOG_DEBUG(map1->getName() << " size before de-duping: " << map1->size());
+  LOG_DEBUG(map2->getName() << " size before de-duping: " << map2->size());
+
+  const long map1NodesBefore = map1->getNodeCount();
+  const long map1WaysBefore = map1->getWayCount();
+  const long map1RelationsBefore = map1->getRelationCount();
+  const long map2NodesBefore = map2->getNodeCount();
+  const long map2WaysBefore = map2->getWayCount();
+  const long map2RelationsBefore = map2->getRelationCount();
+
+  // calculate our unique hashes per element for each map and get a list of duplicate pairs within
+  // each map
+
+  QMap<QString, ElementId> map1Hashes;
+  QSet<std::pair<ElementId, ElementId>> duplicates1;
+  _calcElementHashes(map1, map1Hashes, duplicates1);
+  QSet<QString> map1HashesSet = map1Hashes.keys().toSet();
+  LOG_VARD(map1HashesSet.size());
+
+  QMap<QString, ElementId> map2Hashes;
+  QSet<std::pair<ElementId, ElementId>> duplicates2;
+  _calcElementHashes(map2, map2Hashes, duplicates2);
+  QSet<QString> map2HashesSet = map2Hashes.keys().toSet();
+  LOG_VARD(map2HashesSet.size());
+
+  QMap<ElementType::Type, QSet<ElementId>> elementsToRemove;
+  QMap<ElementId, QString> elementIdsToRemoveFromMap;
+
+  if (_dedupeIntraMap)
+  {
+    // remove the dupes within each map
+
+    LOG_DEBUG("Recording " << map1->getName() << " duplicates...");
+    _dupesToElementIdsCheckMap(
+      duplicates1, map1, map2, elementsToRemove, elementIdsToRemoveFromMap);
+
+    LOG_DEBUG("Recording " << map2->getName() << " duplicates...");
+    _dupesToElementIdsCheckMap(
+      duplicates2, map1, map2, elementsToRemove, elementIdsToRemoveFromMap);
+  }
+
+  // now, calculate the duplicates when comparing the map's between each other; we'll arbitrarily
+  // remove features from the second map except where _favorMoreConnectedWays has influence if it
+  // has been enabled
+  _dupeHashesToElementIdsCheckMap(
+    map1HashesSet.intersect(map2HashesSet), map1, map2, map1Hashes, map2Hashes, elementsToRemove,
+    elementIdsToRemoveFromMap);
+
+  if (_dedupeRelations)
+  {
+    _removeElements(elementsToRemove[ElementType::Relation], map2);
+  }
+  if (_dedupeWays)
+  {
+    LOG_DEBUG("Removing duplicate ways...");
+    if (!_favorMoreConnectedWays)
+    {
+      _removeElements(elementsToRemove[ElementType::Way], map2);
+    }
+    else
+    {
+      _removeWaysCheckMap(
+        elementsToRemove[ElementType::Way], map1, map2, elementIdsToRemoveFromMap);
+    }
+  }
+  if (_dedupeNodes)
+  {
+    _removeElements(elementsToRemove[ElementType::Node], map2);
+  }
+
+  LOG_DEBUG(map1->getName() << " size after de-duping: " << map1->size());
+  LOG_DEBUG(map2->getName() << " size after de-duping: " << map2->size());
+  LOG_STATUS(
+    "Removed " << map1NodesBefore - map1->getNodeCount() << " duplicate nodes from " <<
+    map1->getName());
+  LOG_STATUS(
+    "Removed " << map1WaysBefore - map1->getWayCount() << " duplicate ways from " <<
+    map1->getName());
+  LOG_STATUS(
+    "Removed " << map1RelationsBefore - map1->getRelationCount() << " duplicate relations from " <<
+    map1->getName());
+  LOG_STATUS(
+    "Removed " << map2NodesBefore - map2->getNodeCount() << " duplicate nodes from " <<
+    map2->getName());
+  LOG_STATUS(
+    "Removed " << map2WaysBefore - map2->getWayCount() << " duplicate ways from " <<
+    map2->getName());
+  LOG_STATUS(
+    "Removed " << map2RelationsBefore - map2->getRelationCount() << " duplicate relations from " <<
+    map2->getName());
+}
+
+void ElementDeduplicator::_validateInputs()
+{
+  if (!_dedupeNodes && !_dedupeWays && !_dedupeRelations)
+  {
+    throw IllegalArgumentException("All element types set to be skipped for de-duplication.");
+  }
+}
+
+void ElementDeduplicator::_calcElementHashes(
+  OsmMapPtr map, QMap<QString, ElementId>& hashes,
+  QSet<std::pair<ElementId, ElementId>>& duplicates)
+{
+  ElementHashVisitor hashVis;
+  hashVis.setWriteHashes(false);
+  hashVis.setCollectHashes(true);
+
+  LOG_DEBUG("Calculating " << map->getName() << " element hashes...");
+  hashVis.setOsmMap(map.get());
+  map->visitRw(hashVis);
+  hashes = hashVis.getHashes();
+  duplicates = hashVis.getDuplicates();
+  LOG_VARD(duplicates.size());
+}
+
+void ElementDeduplicator::_removeElements(const QSet<ElementId>& elementsToRemove, OsmMapPtr map)
+{
+  if (elementsToRemove.size() == 0)
+  {
+    return;
+  }
+
+  // This works b/c we're sending in all ids of the same element type.
+  const ElementId id = *elementsToRemove.begin();
+  LOG_DEBUG(
+    "Removing duplicate " << id.getType().toString() << "s from " << map->getName() << "...");
+
+  ElementCriterionPtr crit;
+  if (id.getType() == ElementType::Node && _nodeCrit)
+  {
+    crit = _nodeCrit;
+  }
+  else if (id.getType() == ElementType::Way && _wayCrit)
+  {
+    crit = _wayCrit;
+  }
+  if (id.getType() == ElementType::Relation && _relationCrit)
+  {
+    crit = _relationCrit;
+  }
+
+  for (QSet<ElementId>::const_iterator itr = elementsToRemove.begin();
+       itr != elementsToRemove.end(); ++itr)
+  {
+    const ElementId elementId = *itr;
+    ConstElementPtr element = map->getElement(elementId);
+    if (element && (!crit || crit->isSatisfied(element)))
+    {
+      LOG_TRACE("Removing " << elementId << "...");
+      RemoveElementByEid::removeElementNoCheck(map, elementId);
+    }
+  }
+}
+
+void ElementDeduplicator::_removeWaysCheckMap(
+  const QSet<ElementId>& waysToRemove, OsmMapPtr map1, OsmMapPtr map2,
+  const QMap<ElementId, QString>& elementIdsToRemoveFromMap)
+{
+  LOG_VARD(waysToRemove.size());
+
+  for (QSet<ElementId>::const_iterator itr = waysToRemove.begin();
+       itr != waysToRemove.end(); ++itr)
+  {
+    const ElementId elementId = *itr;
+    assert(elementId.getType() == ElementType::Way);
+
+    OsmMapPtr mapToRemoveFrom;
+    if (elementIdsToRemoveFromMap[elementId] == "1")
+    {
+      mapToRemoveFrom = map1;
+    }
+    else
+    {
+      mapToRemoveFrom = map2;
+    }
+
+    ConstElementPtr element = mapToRemoveFrom->getElement(elementId);
+    if (element && (!_wayCrit || _wayCrit->isSatisfied(element)))
+    {
+      LOG_TRACE("Removing " << elementId << " from " << mapToRemoveFrom->getName() << "...");
+      RemoveElementByEid::removeElementNoCheck(mapToRemoveFrom, elementId);
+    }
+  }
+}
+
+QMap<ElementType::Type, QSet<ElementId>> ElementDeduplicator::_dupesToElementIds(
+  const QSet<std::pair<ElementId, ElementId>>& duplicates)
+{
+  QMap<ElementType::Type, QSet<ElementId>> elementsToRemove;
+
+  for (QSet<std::pair<ElementId, ElementId>>::const_iterator itr = duplicates.begin();
+       itr != duplicates.end(); ++itr)
+  {
+    const std::pair<ElementId, ElementId> dupes = *itr;
+    const ElementId dupe1 = dupes.first;
+    LOG_VART(dupe1);
+    const ElementId dupe2 = dupes.second;
+    LOG_VART(dupe2);
+    const ElementType elementType = dupe1.getType();
+    LOG_VART(elementType);
+    assert(elementType == dupe2.getType());
+
+    elementsToRemove[elementType.getEnum()].insert(dupe1);
+  }
+
+  return elementsToRemove;
+}
+
+void ElementDeduplicator::_dupesToElementIdsCheckMap(
+  const QSet<std::pair<ElementId, ElementId>>& duplicates, OsmMapPtr map1, OsmMapPtr map2,
+  QMap<ElementType::Type, QSet<ElementId>>& elementsToRemove,
+  QMap<ElementId, QString>& elementIdsToRemoveFromMap)
+{
+  for (QSet<std::pair<ElementId, ElementId>>::const_iterator itr = duplicates.begin();
+         itr != duplicates.end(); ++itr)
+  {
+    const std::pair<ElementId, ElementId> dupes = *itr;
+    const ElementId dupe1 = dupes.first;
+    LOG_VART(dupe1);
+    const ElementId dupe2 = dupes.second;
+    LOG_VART(dupe2);
+    const ElementType elementType = dupe1.getType();
+    LOG_VART(elementType);
+    assert(elementType == dupe2.getType());
+
+    if (elementType == ElementType::Way && _favorMoreConnectedWays)
+    {
+      const int numConnectedTo1 = WayUtils::getNumberOfConnectedWays(dupe1.getId(), map1);
+      LOG_VART(numConnectedTo1);
+      const int numConnectedTo2 = WayUtils::getNumberOfConnectedWays(dupe2.getId(), map2);
+      LOG_VART(numConnectedTo2);
+      if (numConnectedTo1 >= numConnectedTo2)
+      {
+        elementIdsToRemoveFromMap[dupe1] = "2";
+        elementsToRemove[elementType.getEnum()].insert(dupe2);
+      }
+      else
+      {
+        elementIdsToRemoveFromMap[dupe2] = "1";
+        elementsToRemove[elementType.getEnum()].insert(dupe1);
+      }
+    }
+    else
+    {
+      // We always favor the first map when the number of connected ways aren't a factor, so record
+      // the second map's element as the one to be removed.
+      elementsToRemove[elementType.getEnum()].insert(dupe2);
+    }
+  }
+}
+
+void ElementDeduplicator::_dupeHashesToElementIdsCheckMap(
+  const QSet<QString>& sharedHashes, OsmMapPtr map1, OsmMapPtr map2,
+  const QMap<QString, ElementId>& map1Hashes, const QMap<QString, ElementId>& map2Hashes,
+  QMap<ElementType::Type, QSet<ElementId>>& elementsToRemove,
+  QMap<ElementId, QString>& elementIdsToRemoveFromMap)
+{
+  LOG_DEBUG(
+    "Calculating duplicates between " << map1->getName() << " and " << map2->getName() << "...");
+  LOG_VARD(sharedHashes.size());
+  for (QSet<QString>::const_iterator itr = sharedHashes.begin(); itr != sharedHashes.end(); ++itr)
+  {
+    const QString sharedHash = *itr;
+    const ElementId toRemove1 = map1Hashes[sharedHash];
+    LOG_VART(toRemove1);
+    const ElementId toRemove2 = map2Hashes[sharedHash];
+    LOG_VART(toRemove2);
+    const ElementType elementType = toRemove1.getType();
+    LOG_VART(elementType);
+    assert(elementType == toRemove2.getType());
+
+    if (elementType == ElementType::Way && _favorMoreConnectedWays)
+    {
+      const int numConnectedTo1 = WayUtils::getNumberOfConnectedWays(toRemove1.getId(), map1);
+      LOG_VART(numConnectedTo1);
+      const int numConnectedTo2 = WayUtils::getNumberOfConnectedWays(toRemove2.getId(), map2);
+      LOG_VART(numConnectedTo2);
+      if (numConnectedTo1 >= numConnectedTo2)
+      {
+        elementIdsToRemoveFromMap[toRemove1] = "2";
+        elementsToRemove[elementType.getEnum()].insert(toRemove2);
+      }
+      else
+      {
+        elementIdsToRemoveFromMap[toRemove2] = "1";
+        elementsToRemove[elementType.getEnum()].insert(toRemove1);
+      }
+    }
+    else
+    {
+      // see related note in _dupesToElementIdsCheckMap
+      elementsToRemove[elementType.getEnum()].insert(toRemove2);
+    }
+  }
+}
+
+}
Clone this wiki locally