Skip to content

v0.2.49..v0.2.50 changeset ScoreMatchesDiff.cpp

Garret Voltz edited this page Nov 6, 2019 · 1 revision
diff --git a/hoot-rnd/src/main/cpp/hoot/rnd/conflate/matching/ScoreMatchesDiff.cpp b/hoot-rnd/src/main/cpp/hoot/rnd/conflate/matching/ScoreMatchesDiff.cpp
new file mode 100644
index 0000000..d337bc5
--- /dev/null
+++ b/hoot-rnd/src/main/cpp/hoot/rnd/conflate/matching/ScoreMatchesDiff.cpp
@@ -0,0 +1,503 @@
+/*
+ * This file is part of Hootenanny.
+ *
+ * Hootenanny is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * --------------------------------------------------------------------
+ *
+ * The following copyright notices are generated automatically. If you
+ * have a new notice to add, please use the format:
+ * " * @copyright Copyright ..."
+ * This will properly maintain the copyright information. DigitalGlobe
+ * copyrights will be updated automatically.
+ *
+ * @copyright Copyright (C) 2019 DigitalGlobe (http://www.digitalglobe.com/)
+ */
+#include "ScoreMatchesDiff.h"
+
+#include <hoot/core/util/Factory.h>
+#include <hoot/core/schema/MetadataTags.h>
+#include <hoot/core/io/IoUtils.h>
+#include <hoot/core/conflate/matching/MatchType.h>
+#include <hoot/core/visitors/ElementIdToTagValueMapper.h>
+#include <hoot/core/io/OsmXmlReader.h>
+#include <hoot/core/conflate/matching/MatchStatusChange.h>
+#include <hoot/core/visitors/UniqueElementIdVisitor.h>
+#include <hoot/core/util/CollectionUtils.h>
+#include <hoot/core/criterion/TagCriterion.h>
+#include <hoot/core/visitors/FilteredVisitor.h>
+#include <hoot/core/visitors/CountManualMatchesVisitor.h>
+#include <hoot/core/visitors/CountUniqueReviewsVisitor.h>
+
+namespace hoot
+{
+
+// Creates an empty differential: the manual match count and the review differential both start
+// at zero; every other member is default constructed.
+ScoreMatchesDiff::ScoreMatchesDiff() : _numManualMatches(0), _reviewDifferential(0)
+{
+}
+
+// Delegates to clear(), which closes the output file if one is still held.
+ScoreMatchesDiff::~ScoreMatchesDiff()
+{
+  clear();
+}
+
+// Returns the object to its freshly constructed state: drops every recorded ID set, forgets the
+// input/output paths, closes and releases the output file, and zeroes both counters.
+void ScoreMatchesDiff::clear()
+{
+  _newlyWrong.clear();
+  _newlyCorrect.clear();
+  _newlyWrongMatchSwitches.clear();
+  _newlyCorrectMatchSwitches.clear();
+  _elementIdsAdded.clear();
+  _elementIdsRemoved.clear();
+
+  _input1 = "";
+  _input2 = "";
+  _output = "";
+  if (_outputFile)
+  {
+    _outputFile->close();
+  }
+  _outputFile.reset();
+
+  _numManualMatches = 0;
+  // The review differential is assigned by calculateDiff; reset it as well so a cleared object
+  // never reports a review delta left over from a previous calculation.
+  _reviewDifferential = 0;
+}
+
+// Loads two match scoring outputs and records how their scoring status differs: elements that
+// became wrong or stopped being wrong, those elements grouped by match status change, element IDs
+// added/removed between the files, and the change in the number of reviews.
+//
+// @param input1 path to the original scored output; must be supported by OsmXmlReader
+// @param input2 path to the new scored output; must be supported by OsmXmlReader
+// @throws IllegalArgumentException if either input is not supported by OsmXmlReader
+// @throws HootException if the two inputs contain a different number of manual matches
+void ScoreMatchesDiff::calculateDiff(const QString& input1, const QString& input2)
+{
+  // Validate both inputs before any member state is touched.
+  if (!OsmXmlReader().isSupported(input1))
+  {
+    throw IllegalArgumentException(
+      "Unsupported input format: " + input1 + " Must be an .osm file.");
+  }
+  if (!OsmXmlReader().isSupported(input2))
+  {
+    throw IllegalArgumentException(
+      "Unsupported input format: " + input2 + " Must be an .osm file.");
+  }
+
+  _input1 = input1;
+  _input2 = input2;
+  LOG_INFO(
+    "Calculating match scoring differential for " << _input1.right(25) << " and " <<
+    _input2.right(25) << "...");
+
+  // Considered making this not memory bound, but our scoring data outputs aren't ever humongous
+  // and the inputs had to be conflated at some point where they were read completely into
+  // memory anyway. There's a lot of repeated element processing here that could be optimized.
+
+  OsmMapPtr map1(new OsmMap());
+  IoUtils::loadMap(map1, _input1, true);
+  map1->setName("original");
+  OsmMapPtr map2(new OsmMap());
+  IoUtils::loadMap(map2, _input2, true);
+  map2->setName("new");
+
+  LOG_VARD(map1->size());
+  LOG_VARD(map2->size());
+
+  // count the manual matches made; one visitor per map so no state is shared between the counts
+
+  CountManualMatchesVisitor manualMatchVisitor1;
+  map1->visitRo(manualMatchVisitor1);
+  const int numManualMatches1 = manualMatchVisitor1.getStat();
+  CountManualMatchesVisitor manualMatchVisitor2;
+  map2->visitRo(manualMatchVisitor2);
+  const int numManualMatches2 = manualMatchVisitor2.getStat();
+
+  LOG_VARD(numManualMatches1);
+  LOG_VARD(numManualMatches2);
+
+  if (numManualMatches1 != numManualMatches2)
+  {
+    throw HootException(
+      QString("The two input datasets have a different number of manual matches and, therefore, ") +
+      QString("must not have been derived from the same input data."));
+  }
+  _numManualMatches = numManualMatches1;
+
+  // get all expected/actual match types
+
+  const QMap<ElementId, QString> expected1 =
+    _getMatchStatuses(map1, MetadataTags::HootExpected());
+  const QMap<ElementId, QString> actual1 =
+    _getMatchStatuses(map1, MetadataTags::HootActual());
+  const QMap<ElementId, QString> expected2 =
+    _getMatchStatuses(map2, MetadataTags::HootExpected());
+  const QMap<ElementId, QString> actual2 =
+    _getMatchStatuses(map2, MetadataTags::HootActual());
+
+  LOG_VARD(expected1.size());
+  LOG_VARD(actual1.size());
+  assert(expected1.size() == actual1.size());
+  LOG_VARD(expected2.size());
+  LOG_VARD(actual2.size());
+  assert(expected2.size() == actual2.size());
+
+  // get newly wrong and corrected matches: newly wrong = wrong in the new file only;
+  // newly correct = wrong in the original file only
+
+  const QSet<ElementId> wrong1 = _getWrong(map1);
+  const QSet<ElementId> wrong2 = _getWrong(map2);
+  _newlyWrong = wrong2;
+  _newlyWrong.subtract(wrong1);
+  _newlyCorrect = wrong1;
+  _newlyCorrect.subtract(wrong2);
+
+  LOG_VARD(wrong1.size());
+  LOG_VARD(wrong2.size());
+  LOG_VARD(_newlyWrong.size());
+  LOG_VARD(_newlyCorrect.size());
+
+  // for newly wrong/correct, compare actual to expected to group match changes by type
+  // NOTE(review): the original author flagged uncertainty about passing expected2/actual2 for
+  // both sets. IDs in _newlyCorrect are drawn from the original map's wrong set, while
+  // _getMatchScoringDiff throws for any ID missing from the supplied mappings and for any
+  // expected/actual pair that is not one of the six handled status changes - confirm that the
+  // second file's mappings are the intended ones for the newly correct set.
+
+  _newlyWrongMatchSwitches = _getMatchScoringDiff(_newlyWrong, expected2, actual2);
+  _newlyCorrectMatchSwitches = _getMatchScoringDiff(_newlyCorrect, expected2, actual2);
+
+  LOG_VARD(_newlyWrongMatchSwitches.size());
+  for (QMap<QString, QSet<ElementId>>::const_iterator itr = _newlyWrongMatchSwitches.begin();
+       itr != _newlyWrongMatchSwitches.end(); ++itr)
+  {
+    LOG_DEBUG(itr.key() << " " << itr.value().size());
+  }
+  LOG_VARD(_newlyCorrectMatchSwitches.size());
+  for (QMap<QString, QSet<ElementId>>::const_iterator itr = _newlyCorrectMatchSwitches.begin();
+       itr != _newlyCorrectMatchSwitches.end(); ++itr)
+  {
+    LOG_DEBUG(itr.key() << " " << itr.value().size());
+  }
+
+  // get a list of all element ids
+
+  const QSet<ElementId> all1Ids = _getAllIds(map1);
+  const QSet<ElementId> all2Ids = _getAllIds(map2);
+
+  LOG_VARD(all1Ids.size());
+  LOG_VARD(all2Ids.size());
+
+  // determine any ids that are new or have been removed
+
+  _setAddedAndRemovedElements(all1Ids, all2Ids, _elementIdsAdded, _elementIdsRemoved);
+
+  LOG_VARD(_elementIdsAdded.size());
+  LOG_VARD(_elementIdsRemoved.size());
+
+  // count the reviews; the visitor is cleared between maps so the second count starts fresh
+
+  CountUniqueReviewsVisitor countReviewsVis;
+  map1->visitRo(countReviewsVis);
+  const int numReviews1 = (int)countReviewsVis.getStat();
+  countReviewsVis.clear();
+  map2->visitRo(countReviewsVis);
+  const int numReviews2 = (int)countReviewsVis.getStat();
+  // positive: the new file has more reviews; negative: it has fewer
+  _reviewDifferential = numReviews2 - numReviews1;
+
+  LOG_VARD(numReviews1);
+  LOG_VARD(numReviews2);
+  LOG_VARD(_reviewDifferential);
+
+  LOG_INFO(
+    "Match scoring differential calculated for " <<
+    StringUtils::formatLargeNumber(all1Ids.size() + all2Ids.size()) << " total elements.");
+}
+
+// Writes the previously calculated differential to a text file; the summary portion is also
+// echoed to standard out by _writeConflateStatusSummary.
+//
+// @param output path of the report file to write
+// @return false (after logging a warning, with nothing written) when no match switches and no
+// added/removed elements have been recorded; true once the report has been written
+// @throws HootException if the output file cannot be opened for writing
+bool ScoreMatchesDiff::printDiff(const QString& output)
+{
+  _output = output;
+  if (_newlyWrongMatchSwitches.isEmpty() && _newlyCorrectMatchSwitches.isEmpty() &&
+      _elementIdsAdded.isEmpty() && _elementIdsRemoved.isEmpty())
+  {
+    LOG_WARN(
+      "There are no differences in match scoring status between the two input files. Did you " <<
+      "calculate a differential? Are the input files identical?");
+    return false;
+  }
+
+  LOG_INFO(
+    "Writing match scoring differential for " << _input1.right(25) << " and " <<
+     _input2.right(25) << " to " << _output.right(25) << "...");
+
+  _outputFile = _getOutputFile(_output);
+  QTextStream out(_outputFile.get());
+
+  out << "Input files: " << _input1.right(25) << " and " << _input2.right(25) << "\n\n";
+  _writeConflateStatusSummary(out);
+  _writeConflateStatusDetail(out);
+
+  // Push everything to disk before returning so the report is complete for any caller that reads
+  // it right away, rather than only after clear() or destruction closes the file.
+  out.flush();
+  _outputFile->close();
+
+  return true;
+}
+
+// Opens a text file for writing at the given path, removing any file that already exists there.
+//
+// @param outputPath path of the file to create
+// @return the opened file
+// @throws HootException if the file cannot be opened for writing
+std::shared_ptr<QFile> ScoreMatchesDiff::_getOutputFile(const QString& outputPath)
+{
+  std::shared_ptr<QFile> outputFile = std::make_shared<QFile>(outputPath);
+  if (outputFile->exists())
+  {
+    outputFile->remove();
+  }
+  if (!outputFile->open(QFile::WriteOnly | QFile::Text))
+  {
+    // Name the offending path so the failure can be diagnosed from the message alone.
+    throw HootException("Unable to write to output file: " + outputPath);
+  }
+  return outputFile;
+}
+
+// Collects the element IDs gathered by a UniqueElementIdVisitor pass over the map and returns
+// them as a QSet.
+QSet<ElementId> ScoreMatchesDiff::_getAllIds(const ConstOsmMapPtr& map)
+{
+  LOG_DEBUG("Retrieving all IDs for " << map->getName() << "...");
+
+  UniqueElementIdVisitor idCollector;
+  map->visitRo(idCollector);
+
+  const QSet<ElementId> allIds = CollectionUtils::stdSetToQSet(idCollector.getElementSet());
+  return allIds;
+}
+
+// Runs an ElementIdToTagValueMapper configured with the given tag key over the map and returns
+// the element ID to tag value mappings it produced.
+QMap<ElementId, QString> ScoreMatchesDiff::_getMatchStatuses(
+  const ConstOsmMapPtr& map, const QString& tagKey)
+{
+  LOG_DEBUG("Retrieving match status: " << tagKey << " for " << map->getName() << "...");
+
+  ElementIdToTagValueMapper tagValueMapper;
+  tagValueMapper.setTagKey(tagKey);
+  map->visitRo(tagValueMapper);
+
+  const QMap<ElementId, QString> idsToTagValues = tagValueMapper.getIdToTagValueMappings();
+  return idsToTagValues;
+}
+
+// Returns the IDs collected from the map by a UniqueElementIdVisitor that is filtered with a
+// TagCriterion on the MetadataTags::HootWrong() key and the value "1".
+QSet<ElementId> ScoreMatchesDiff::_getWrong(const ConstOsmMapPtr& map)
+{
+  LOG_DEBUG("Retrieving wrongly match elements for " << map->getName() << "...");
+
+  UniqueElementIdVisitor idCollector;
+  TagCriterion wrongCrit(MetadataTags::HootWrong(), "1");
+  FilteredVisitor wrongOnlyVis(wrongCrit, idCollector);
+  map->visitRo(wrongOnlyVis);
+
+  const QSet<ElementId> wrongIds = CollectionUtils::stdSetToQSet(idCollector.getElementSet());
+  return wrongIds;
+}
+
+// Groups the given element IDs by the match status change implied by each element's expected
+// and actual match type strings.
+//
+// @param elementIds IDs of the elements to classify
+// @param expectedTagMappings expected match type string, keyed by element ID
+// @param actualTagMappings actual match type string, keyed by element ID
+// @return element IDs keyed by the string form of their MatchStatusChange
+// @throws HootException if an ID is missing from either mapping
+// @throws IllegalArgumentException if an element's expected/actual pair is not one of the six
+// handled changes (e.g. the expected and actual types are the same)
+QMap<QString, QSet<ElementId>> ScoreMatchesDiff::_getMatchScoringDiff(
+  const QSet<ElementId>& elementIds, const QMap<ElementId, QString>& expectedTagMappings,
+  const QMap<ElementId, QString>& actualTagMappings)
+{
+  LOG_DEBUG("Calculating match scoring diff...");
+  QMap<QString, QSet<ElementId>> matchSwitches;
+  for (QSet<ElementId>::const_iterator itr = elementIds.begin(); itr != elementIds.end(); ++itr)
+  {
+    const ElementId id = *itr;
+
+    // A single lookup per mapping replaces the contains() + operator[] pair.
+    const QMap<ElementId, QString>::const_iterator expectedItr =
+      expectedTagMappings.constFind(id);
+    if (expectedItr == expectedTagMappings.constEnd())
+    {
+      throw HootException("Can't find ID: " + id.toString() + " for expected.");
+    }
+    const QString expectedMatchTypeStr = expectedItr.value();
+    MatchType expectedMatchType = MatchType(expectedMatchTypeStr);
+
+    const QMap<ElementId, QString>::const_iterator actualItr = actualTagMappings.constFind(id);
+    if (actualItr == actualTagMappings.constEnd())
+    {
+      throw HootException("Can't find ID: " + id.toString() + " for actual.");
+    }
+    const QString actualMatchTypeStr = actualItr.value();
+    MatchType actualMatchType = MatchType(actualMatchTypeStr);
+
+    LOG_VART(expectedMatchTypeStr);
+    LOG_VART(actualMatchTypeStr);
+
+    // Work out which status change this element represents, then record it once below.
+    QString changeKey;
+    if (expectedMatchType == MatchType::Match && actualMatchType == MatchType::Miss)
+    {
+      changeKey = MatchStatusChange(MatchStatusChange::MatchToMiss).toString();
+    }
+    else if (expectedMatchType == MatchType::Match && actualMatchType == MatchType::Review)
+    {
+      changeKey = MatchStatusChange(MatchStatusChange::MatchToReview).toString();
+    }
+    else if (expectedMatchType == MatchType::Miss && actualMatchType == MatchType::Match)
+    {
+      changeKey = MatchStatusChange(MatchStatusChange::MissToMatch).toString();
+    }
+    else if (expectedMatchType == MatchType::Miss && actualMatchType == MatchType::Review)
+    {
+      changeKey = MatchStatusChange(MatchStatusChange::MissToReview).toString();
+    }
+    else if (expectedMatchType == MatchType::Review && actualMatchType == MatchType::Match)
+    {
+      changeKey = MatchStatusChange(MatchStatusChange::ReviewToMatch).toString();
+    }
+    else if (expectedMatchType == MatchType::Review && actualMatchType == MatchType::Miss)
+    {
+      changeKey = MatchStatusChange(MatchStatusChange::ReviewToMiss).toString();
+    }
+    else
+    {
+      // Say which element and which pair of types could not be classified.
+      throw IllegalArgumentException(
+        "Unsupported match status change for " + id.toString() + ": expected=" +
+        expectedMatchTypeStr + ", actual=" + actualMatchTypeStr);
+    }
+    matchSwitches[changeKey].insert(id);
+  }
+  return matchSwitches;
+}
+
+// Computes which element IDs appear only in the second ID set (added) and which appear only in
+// the first (removed).
+//
+// @param all1Ids every element ID from the first input
+// @param all2Ids every element ID from the second input
+// @param elementIdsAdded output; overwritten with the IDs in all2Ids that are not in all1Ids
+// @param elementIdsRemoved output; overwritten with the IDs in all1Ids that are not in all2Ids
+void ScoreMatchesDiff::_setAddedAndRemovedElements(
+  const QSet<ElementId>& all1Ids, const QSet<ElementId>& all2Ids,
+  QSet<ElementId>& elementIdsAdded, QSet<ElementId>& elementIdsRemoved)
+{
+  LOG_DEBUG("Recording added/removed elements...");
+
+  // (A u B) - A is the same set as B - A, so the two differences are taken directly instead of
+  // first building the union of both inputs.
+  elementIdsAdded = all2Ids;
+  elementIdsAdded.subtract(all1Ids);
+
+  elementIdsRemoved = all1Ids;
+  elementIdsRemoved.subtract(all2Ids);
+}
+
+// Builds the textual summary of the differential (manual match count, removed/added element
+// counts, newly wrong/correct counts, a count per match status change, and the review delta),
+// prints it to standard out along with the location of the detailed report, and writes the same
+// summary to the given stream.
+//
+// @param out stream that receives the summary text
+void ScoreMatchesDiff::_writeConflateStatusSummary(QTextStream& out)
+{
+  LOG_DEBUG("Writing conflate status summary...");
+
+  QString summary = "Match Scoring Differential Summary:\n\n";
+
+  summary += StringUtils::formatLargeNumber(_numManualMatches) + " manual matches.\n";
+  summary +=
+    StringUtils::formatLargeNumber(_elementIdsRemoved.size()) +
+    " elements from the first file were removed in the second file.\n";
+  summary +=
+    StringUtils::formatLargeNumber(_elementIdsAdded.size()) +
+    " new elements were added to the second file.\n";
+  summary +=
+    StringUtils::formatLargeNumber(_newlyWrong.size()) +
+    " new elements are involved in wrong matches.\n";
+  summary +=
+    StringUtils::formatLargeNumber(_newlyCorrect.size()) +
+    " new elements are involved in correct matches.\n";
+
+  // Appends one line per match status change; only the label in the middle differs between the
+  // wrong and the correct groups.
+  const auto appendSwitchCounts =
+    [&summary](const QMap<QString, QSet<ElementId>>& switches, const QString& label)
+    {
+      for (QMap<QString, QSet<ElementId>>::const_iterator switchItr = switches.begin();
+           switchItr != switches.end(); ++switchItr)
+      {
+        const int numChanged = switchItr.value().size();
+        summary += StringUtils::formatLargeNumber(numChanged) + label + switchItr.key() + ".\n";
+      }
+    };
+  appendSwitchCounts(_newlyWrongMatchSwitches, " new wrong matches switched from ");
+  appendSwitchCounts(_newlyCorrectMatchSwitches, " new correct matches switched from ");
+
+  if (_reviewDifferential == 0)
+  {
+    summary += "No reviews were added.\n";
+  }
+  else if (_reviewDifferential > 0)
+  {
+    summary += QString::number(_reviewDifferential) + " reviews were added.\n";
+  }
+  else
+  {
+    // negative differential: report the magnitude as reviews lost
+    summary += QString::number(abs(_reviewDifferential)) + " reviews were lost.\n";
+  }
+
+  std::cout << summary << std::endl;
+  std::cout << "Detailed information available in: " << _output << "." << std::endl;
+
+  out << summary;
+}
+
+// Writes the detailed portion of the report: the sorted element IDs for each match status change
+// (new wrong matches first, then new correct matches), followed by the sorted IDs of added and
+// removed elements. An empty section is written as "none".
+//
+// @param out stream that receives the detail text
+void ScoreMatchesDiff::_writeConflateStatusDetail(QTextStream& out)
+{
+  LOG_DEBUG("Printing conflate status detail...");
+
+  // Writes the IDs of a set in sorted order, one per line.
+  const auto writeSortedIds = [&out](const QSet<ElementId>& idSet)
+  {
+    QList<ElementId> sortedIds = idSet.toList();
+    qSort(sortedIds);
+    for (QList<ElementId>::const_iterator idItr = sortedIds.constBegin();
+         idItr != sortedIds.constEnd(); ++idItr)
+    {
+      out << idItr->toString() << "\n";
+    }
+  };
+
+  // Writes "none" for an empty set, otherwise its sorted IDs.
+  const auto writeIdsOrNone = [&out, &writeSortedIds](const QSet<ElementId>& idSet)
+  {
+    if (idSet.isEmpty())
+    {
+      out << "none";
+    }
+    else
+    {
+      writeSortedIds(idSet);
+    }
+  };
+
+  // Writes "none" for an empty group, otherwise each status change followed by its sorted IDs.
+  const auto writeSwitchesOrNone =
+    [&out, &writeSortedIds](const QMap<QString, QSet<ElementId>>& switches)
+    {
+      if (switches.isEmpty())
+      {
+        out << "none";
+        return;
+      }
+      for (QMap<QString, QSet<ElementId>>::const_iterator switchItr = switches.begin();
+           switchItr != switches.end(); ++switchItr)
+      {
+        out << "\n" << switchItr.key() << "\n\n";
+        writeSortedIds(switchItr.value());
+      }
+    };
+
+  out << "\nMatch Type Changes\n";
+  if (_newlyWrongMatchSwitches.isEmpty() && _newlyCorrectMatchSwitches.isEmpty())
+  {
+    out << "none";
+  }
+  else
+  {
+    out << "\nNew Wrong Matches:\n\n";
+    writeSwitchesOrNone(_newlyWrongMatchSwitches);
+
+    out << "\nNew Correct Matches:\n\n";
+    writeSwitchesOrNone(_newlyCorrectMatchSwitches);
+  }
+
+  out << "\n\nAdded Elements:\n\n";
+  writeIdsOrNone(_elementIdsAdded);
+
+  out << "\nRemoved Elements:\n\n";
+  writeIdsOrNone(_elementIdsRemoved);
+}
+
+}
Clone this wiki locally