Skip to content

v0.2.48..v0.2.49 changeset ImplicitTagRawRulesDeriver.cpp

Garret Voltz edited this page Oct 2, 2019 · 1 revision
diff --git a/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRawRulesDeriver.cpp b/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRawRulesDeriver.cpp
new file mode 100644
index 0000000..d5f1bf1
--- /dev/null
+++ b/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRawRulesDeriver.cpp
@@ -0,0 +1,663 @@
+/*
+ * This file is part of Hootenanny.
+ *
+ * Hootenanny is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * --------------------------------------------------------------------
+ *
+ * The following copyright notices are generated automatically. If you
+ * have a new notice to add, please use the format:
+ * " * @copyright Copyright ..."
+ * This will properly maintain the copyright information. DigitalGlobe
+ * copyrights will be updated automatically.
+ *
+ * @copyright Copyright (C) 2017, 2018, 2019 DigitalGlobe (http://www.digitalglobe.com/)
+ */
+#include "ImplicitTagRawRulesDeriver.h"
+
+// hoot
+#include <hoot/core/io/OsmMapReaderFactory.h>
+#include <hoot/core/io/PartialOsmMapReader.h>
+#include <hoot/core/schema/OsmSchema.h>
+#include <hoot/core/elements/Tags.h>
+#include <hoot/core/util/StringUtils.h>
+#include <hoot/core/util/ConfigOptions.h>
+#include <hoot/core/io/ElementVisitorInputStream.h>
+#include <hoot/core/visitors/SchemaTranslationVisitor.h>
+#include <hoot/core/util/FileUtils.h>
+#include <hoot/core/schema/ImplicitTagUtils.h>
+#include <hoot/core/util/Factory.h>
+
+// Qt
+#include <QStringBuilder>
+#include <QThread>
+
+namespace hoot
+{
+
+/**
+ * Constructs a deriver with default settings.
+ *
+ * setConfiguration() later replaces the sort parallelism, filtering, temp file and translation
+ * settings through the corresponding setters.
+ */
+ImplicitTagRawRulesDeriver::ImplicitTagRawRulesDeriver() :
+// how often progress is logged; read from the default configuration
+_statusUpdateInterval(ConfigOptions().getTaskStatusUpdateInterval()),
+// number of lines written to the raw count temp file so far
+_countFileLineCtr(0),
+// value handed to sort's --parallel option; defaults to what Qt reports as ideal
+_sortParallelCount(QThread::idealThreadCount()),
+// when true, deriveRawRules() processes every element without consulting the element criterion
+_skipFiltering(false),
+// when true, temp files are not auto-removed
+_keepTempFiles(false),
+// directory the temp files are created in; shares the bulk inserter's temp dir option
+_tempFileDir(ConfigOptions().getApidbBulkInserterTempFileDir()),
+// when true, names and name tokens are translated to English before being recorded
+_translateNamesToEnglish(true)
+{
+}
+
+/**
+ * Configures this deriver from the given settings.
+ *
+ * Reads the sort parallelism, filtering, temp file, translation and element criterion options and,
+ * when name translation is enabled, constructs and configures the to-English translator.
+ *
+ * @param conf settings to read the options from
+ * @throws IllegalArgumentException (from setElementCriterion) if the configured element criterion
+ * is not an ImplicitTagEligibleCriterion
+ */
+void ImplicitTagRawRulesDeriver::setConfiguration(const Settings& conf)
+{
+  ConfigOptions options = ConfigOptions(conf);
+
+  setSortParallelCount(options.getImplicitTaggingRawRulesDeriverSortParallelCount());
+  const int idealThreads = QThread::idealThreadCount();
+  LOG_VART(idealThreads);
+  // QThread::idealThreadCount() is documented by Qt 4 to return -1 when the core count cannot be
+  // detected.  Never let a non-positive value become the upper bound, or an otherwise valid
+  // configured count would be replaced with a value that sort's --parallel option rejects.
+  const int maxSortParallelCount = idealThreads < 1 ? 1 : idealThreads;
+  if (_sortParallelCount < 1 || _sortParallelCount > maxSortParallelCount)
+  {
+    setSortParallelCount(maxSortParallelCount);
+  }
+  setSkipFiltering(options.getImplicitTaggingRawRulesDeriverSkipFiltering());
+  setKeepTempFiles(options.getImplicitTaggingKeepTempFiles());
+  setTempFileDir(options.getApidbBulkInserterTempFileDir());
+  setTranslateNamesToEnglish(options.getImplicitTaggingDatabaseDeriverTranslateNamesToEnglish());
+  setElementCriterion(options.getImplicitTaggingElementCriterion());
+
+  if (_translateNamesToEnglish)
+  {
+    // the translator implementation is selected by class name from the configuration
+    _translator.reset(
+      Factory::getInstance().constructObject<ToEnglishTranslator>(
+        options.getLanguageTranslationTranslator()));
+    _translator->setConfiguration(conf);
+    _translator->setSourceLanguages(options.getLanguageTranslationSourceLanguages());
+    _translator->setId("ImplicitTagRawRulesDeriver");
+  }
+}
+
+/**
+ * Sets the criterion used to decide which elements are eligible for rule derivation.
+ *
+ * @param criterionName class name of the criterion to construct via the Factory
+ * @throws IllegalArgumentException if the constructed object is not an
+ * ImplicitTagEligibleCriterion
+ */
+void ImplicitTagRawRulesDeriver::setElementCriterion(const QString& criterionName)
+{
+  ElementCriterion* criterion =
+    Factory::getInstance().constructObject<ElementCriterion>(criterionName);
+  ImplicitTagEligibleCriterion* eligibleCriterion =
+    dynamic_cast<ImplicitTagEligibleCriterion*>(criterion);
+  if (eligibleCriterion == nullptr)
+  {
+    // The factory handed over ownership of the object; release it before throwing so a bad
+    // criterion name doesn't leak it.
+    delete criterion;
+    throw IllegalArgumentException("Invalid criterion type: " + criterionName);
+  }
+  _elementCriterion.reset(eligibleCriterion);
+}
+
+/**
+ * Resets the per-run state and opens a fresh temp file to collect the raw word/kvp lines in.
+ *
+ * @throws HootException if the temp file can't be opened
+ */
+void ImplicitTagRawRulesDeriver::_init()
+{
+  // discard anything left over from a previous run
+  _countFileLineCtr = 0;
+  _duplicatedWordTagKeyCountsToValues.clear();
+  _wordKeysToCountsValues.clear();
+
+  const QString countFileTemplate =
+    _tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX";
+  _countFile.reset(new QTemporaryFile(countFileTemplate));
+  _countFile->setAutoRemove(!_keepTempFiles);
+  const bool opened = _countFile->open();
+  if (!opened)
+  {
+    throw HootException(QObject::tr("Error opening %1 for writing.").arg(_countFile->fileName()));
+  }
+  LOG_DEBUG("Opened temp file: " << _countFile->fileName());
+  if (_keepTempFiles)
+  {
+    // the file outlives the run, so make that visible in the log
+    LOG_WARN("Keeping temp file: " << _countFile->fileName());
+  }
+}
+
+/**
+ * Derives the raw implicit tag rules from the given inputs and writes them to output.
+ *
+ * Streams every element of every input, records a line per name/name token and eligible kvp in
+ * a temp file, and then runs the sort/dedupe/tie resolution passes, with the last sort writing
+ * the final output file.
+ *
+ * @param inputs paths of the datasets to read
+ * @param translationScripts one translation script per input; "none" bypasses translation
+ * @param output path of the raw rules file to write
+ * @throws HootException on invalid arguments, if an element yields no eligible kvps, or if a
+ * temp file or sort step fails
+ */
+void ImplicitTagRawRulesDeriver::deriveRawRules(const QStringList& inputs,
+                                                const QStringList& translationScripts,
+                                                const QString& output)
+{
+  _validateInputs(inputs, translationScripts, output);
+
+  LOG_INFO(
+    "Generating implicit tag rules raw file for inputs: " << inputs <<
+    ", translation scripts: " << translationScripts << ".  Writing to output: " << output << "...");
+  LOG_VARD(_sortParallelCount);
+  LOG_VARD(_skipFiltering);
+  LOG_VARD(_translateNamesToEnglish);
+
+  _init();
+
+  long numEligibleFeatures = 0;
+  long numTotalFeatures = 0;
+  for (int inputIdx = 0; inputIdx < inputs.size(); ++inputIdx)
+  {
+    std::shared_ptr<ElementInputStream> inputStream =
+      _getInputStream(inputs.at(inputIdx), translationScripts.at(inputIdx));
+    while (inputStream->hasMoreElements())
+    {
+      ElementPtr element = inputStream->readNextElement();
+      LOG_VART(element);
+
+      ++numTotalFeatures;
+
+      assert(_elementCriterion.get());
+      // unless filtering is turned off, only elements passing the criterion are processed
+      if (!_skipFiltering && !_elementCriterion->isSatisfied(element))
+      {
+        continue;
+      }
+
+      QStringList names = element->getTags().getNames();
+      assert(!names.isEmpty());
+
+      // old_name/former_name generally indicates that an element formerly went by the name, so
+      // those entries aren't useful here
+      const int numOldNameEntriesRemoved = names.removeAll("old_name");
+      if (numOldNameEntriesRemoved > 0)
+      {
+        LOG_VART("Removed old name tag.");
+      }
+      const int numFormerNameEntriesRemoved = names.removeAll("former_name");
+      if (numFormerNameEntriesRemoved > 0)
+      {
+        LOG_VART("Removed former name tag.");
+      }
+      assert(!names.isEmpty());
+
+      if (_translateNamesToEnglish)
+      {
+        names = ImplicitTagUtils::translateNamesToEnglish(names, element->getTags(), _translator);
+      }
+      LOG_VART(names);
+
+      // only the tags worth applying implicitly to future elements, based on name, come back
+      const QStringList kvps = _elementCriterion->getEligibleKvps(element->getTags());
+      assert(!kvps.isEmpty());
+      if (kvps.isEmpty())
+      {
+        throw HootException("Kvps empty.");
+      }
+
+      // records both the whole names and their token groups
+      _parseNames(names, kvps);
+
+      ++numEligibleFeatures;
+
+      if (numEligibleFeatures % _statusUpdateInterval == 0)
+      {
+        PROGRESS_INFO(
+          "Parsed " << StringUtils::formatLargeNumber(numEligibleFeatures) <<
+          " eligible features / " << StringUtils::formatLargeNumber(numTotalFeatures) <<
+          " total features.");
+      }
+    }
+    _inputReader->finalizePartial();
+  }
+  _countFile->close();
+
+  LOG_INFO(
+    "Parsed " << StringUtils::formatLargeNumber(numEligibleFeatures) <<
+    " eligible features from " << StringUtils::formatLargeNumber(numTotalFeatures) <<
+    " total features.");
+  LOG_INFO(
+    "Wrote " << StringUtils::formatLargeNumber(_countFileLineCtr) << " lines to count file.");
+
+  // descending count order is what the duplicate removal pass relies on
+  _sortByTagOccurrence();
+  _removeDuplicatedKeyTypes();
+  // must be captured before resolving ties, since that pass empties the tie collection
+  const bool tieCountsNeededResolved = _duplicatedWordTagKeyCountsToValues.size() > 0;
+  if (tieCountsNeededResolved)
+  {
+    _resolveCountTies();
+  }
+  LOG_INFO(
+    "Extracted "  << StringUtils::formatLargeNumber(_wordKeysToCountsValues.size()) <<
+    " word/tag associations.");
+  LOG_INFO("Clearing word/tag associations...");
+  _wordKeysToCountsValues.clear();
+  // the final sort consumes whichever temp file the last pass that ran produced
+  if (!tieCountsNeededResolved)
+  {
+    _sortByWord(_dedupedCountFile);
+  }
+  else
+  {
+    _sortByWord(_tieResolvedCountFile);
+  }
+}
+
+/**
+ * Checks the arguments passed to deriveRawRules and prepares the output file.
+ *
+ * Any existing file at the output path is removed so the final sort can write to it.
+ *
+ * @param inputs paths of the datasets to read
+ * @param translationScripts translation scripts; must have the same number of entries as inputs
+ * @param output path of the raw rules file to write
+ * @throws HootException if no element criterion is set, an argument is empty, the list sizes
+ * differ, an existing output can't be removed, or translation is enabled without a translator
+ */
+void ImplicitTagRawRulesDeriver::_validateInputs(const QStringList& inputs,
+                                                 const QStringList& translationScripts,
+                                                 const QString& output)
+{
+  LOG_VARD(inputs);
+  LOG_VARD(translationScripts);
+  LOG_VARD(output);
+
+  if (_elementCriterion.get() == nullptr)
+  {
+    throw HootException("No element type was specified.");
+  }
+
+  if (inputs.isEmpty())
+  {
+    throw HootException("No inputs were specified.");
+  }
+
+  const bool oneScriptPerInput = inputs.size() == translationScripts.size();
+  if (!oneScriptPerInput)
+  {
+    LOG_VARD(inputs.size());
+    LOG_VARD(translationScripts.size());
+    throw HootException(
+      "The size of the input datasets list must equal the size of the list of translation scripts.");
+  }
+
+  if (output.isEmpty())
+  {
+    throw HootException("No output was specified.");
+  }
+
+  // start from a clean slate; a stale output must not survive into this run
+  _output.reset(new QFile(output));
+  if (_output->exists())
+  {
+    if (!_output->remove())
+    {
+      throw HootException(QObject::tr("Error removing existing %1 for writing.").arg(output));
+    }
+  }
+  _output->close();
+
+  const bool translatorMissing = !_translator.get();
+  if (_translateNamesToEnglish && translatorMissing)
+  {
+    throw HootException("To English translation enabled but no translator was specified.");
+  }
+}
+
+/**
+ * Appends a "word<TAB>kvp" line to the raw count temp file for the given word.
+ *
+ * The word is whitespace-simplified, cleaned and lower-cased first.  Nothing is written for
+ * words that end up empty, are numbers, or contain no alphabetic character.
+ *
+ * @param word name or name token to record
+ * @param kvp tag key/value pair to associate with the word
+ */
+void ImplicitTagRawRulesDeriver::_updateForNewWord(const QString& word, const QString& kvp)
+{
+  QString simpleWord = word.simplified();
+  LOG_TRACE("Updating word: " << simpleWord << " with kvp: " << kvp << "...");
+
+  ImplicitTagUtils::cleanName(simpleWord);
+
+  if (simpleWord.isEmpty())
+  {
+    return;
+  }
+
+  if (StringUtils::isNumber(simpleWord))
+  {
+    LOG_TRACE("Skipping word: " << simpleWord << ", which is a number.");
+    return;
+  }
+
+  const bool hasLetters = StringUtils::hasAlphabeticCharacter(simpleWord);
+  if (!hasLetters)
+  {
+    LOG_TRACE("Skipping word: " << simpleWord << ", which has no alphabetic characters.");
+    return;
+  }
+
+  const QString countFileLine = simpleWord.toLower() % QString("\t") % kvp % QString("\n");
+  _countFile->write(countFileLine.toUtf8());
+  ++_countFileLineCtr;
+}
+
+/**
+ * Opens the given input for streamed reading, optionally wrapping it in a schema translation.
+ *
+ * The opened reader is kept in _inputReader so the caller can finalize it after streaming.
+ *
+ * @param input path of the dataset to read
+ * @param translationScript script to translate the elements with; "none" (case insensitive)
+ * bypasses translation
+ * @return a stream over the input's elements, translated if a script was given
+ * @throws HootException if the reader created for the input isn't a PartialOsmMapReader
+ */
+std::shared_ptr<ElementInputStream> ImplicitTagRawRulesDeriver::_getInputStream(
+  const QString& input, const QString& translationScript)
+{
+  LOG_INFO("Parsing: " << input << "...");
+
+  _inputReader =
+    std::dynamic_pointer_cast<PartialOsmMapReader>(OsmMapReaderFactory::createReader(input));
+  // The cast yields a null pointer for readers that aren't partial readers; fail with a clear
+  // message rather than dereferencing it.
+  if (!_inputReader)
+  {
+    throw HootException("Input can't be read as a partial map (streamed): " + input);
+  }
+  _inputReader->open(input);
+  std::shared_ptr<ElementInputStream> inputStream =
+    std::dynamic_pointer_cast<ElementInputStream>(_inputReader);
+  LOG_VARD(translationScript);
+  //"none" allows for bypassing translation for an input; e.g. OSM data
+  if (translationScript.toLower() != "none")
+  {
+    std::shared_ptr<SchemaTranslationVisitor> translationVisitor(new SchemaTranslationVisitor());
+
+    // I think we always want to be going to OSM here unless otherwise specified (or maybe
+    // regardless if its specified), but that should be verified.
+    QString translationDirection =
+      conf().getString(ConfigOptions::getSchemaTranslationDirectionKey());
+    if (translationDirection.trimmed().isEmpty())
+    {
+      translationDirection = "toosm";
+    }
+    LOG_VARD(translationDirection);
+    translationVisitor->setTranslationDirection(translationDirection);
+
+    // always set the direction before setting the script
+    translationVisitor->setTranslationScript(translationScript);
+
+    // from here on the caller reads translated elements instead of the raw reader output
+    inputStream.reset(new ElementVisitorInputStream(_inputReader, translationVisitor));
+  }
+  return inputStream;
+}
+
+/**
+ * Records every given name, each of its tokens, and each pair of adjacent tokens against all of
+ * the given kvps.
+ *
+ * @param names names of a single element
+ * @param kvps tag key/value pairs to associate with the names
+ */
+void ImplicitTagRawRulesDeriver::_parseNames(const QStringList& names, const QStringList& kvps)
+{
+  for (const QString& originalName : names)
+  {
+    QString name = originalName;
+    LOG_VART(name);
+
+    // '=' is used in the map key for kvps, so it needs to be escaped in the word; replacing is a
+    // no-op for names without it
+    name.replace("=", "%3D");
+
+    // the whole name first
+    for (const QString& kvp : kvps)
+    {
+      _updateForNewWord(name, kvp);
+    }
+
+    const QStringList nameTokens = _tokenizer.tokenize(name);
+    LOG_VART(nameTokens.size());
+
+    // then each token on its own
+    for (const QString& token : nameTokens)
+    {
+      QString nameToken = token;
+      _parseNameToken(nameToken, kvps);
+    }
+
+    // Finally groups of two adjacent tokens.  Group size three was tested as well, but didn't
+    // seem to yield any better tagging results.
+    const int numTokens = nameTokens.size();
+    if (numTokens > 2)
+    {
+      for (int second = 1; second < numTokens; ++second)
+      {
+        QString nameToken = nameTokens.at(second - 1) + " " + nameTokens.at(second);
+        _parseNameToken(nameToken, kvps);
+      }
+    }
+  }
+}
+
+/**
+ * Cleans up and optionally translates a name token, then records it against all given kvps.
+ *
+ * @param nameToken token to record; modified in place (commas removed, possibly replaced with
+ * its English translation)
+ * @param kvps tag key/value pairs to associate with the token
+ */
+void ImplicitTagRawRulesDeriver::_parseNameToken(QString& nameToken, const QStringList& kvps)
+{
+  // May eventually need to replace more punctuation chars here, but this is fine for now.  A more
+  // extensible way to do it is needed; that logic could also be moved into
+  // ImplicitTagUtils::cleanName.
+  nameToken.replace(",", "");
+  LOG_VART(nameToken);
+
+  if (_translateNamesToEnglish)
+  {
+    const QString englishNameToken = _translator->translate(nameToken);
+    LOG_VART(englishNameToken);
+    // an empty result leaves the original token in place
+    if (!englishNameToken.isEmpty())
+    {
+      nameToken = englishNameToken;
+    }
+  }
+
+  for (const QString& kvp : kvps)
+  {
+    _updateForNewWord(nameToken, kvp);
+  }
+}
+
+/**
+ * Counts the unique lines of the raw count file and writes them, sorted by descending count, to
+ * a new temp file as "count<TAB>word<TAB>kvp" lines.
+ *
+ * The work is done by a shell pipeline of sort, uniq and sed.
+ *
+ * @throws HootException if the sorted temp file can't be opened, the raw count file doesn't
+ * exist, or the shell command fails
+ */
+void ImplicitTagRawRulesDeriver::_sortByTagOccurrence()
+{
+  LOG_INFO("Sorting output by tag occurrence count...");
+  LOG_VART(_sortParallelCount);
+
+  _sortedCountFile.reset(
+    new QTemporaryFile(_tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX"));
+  _sortedCountFile->setAutoRemove(!_keepTempFiles);
+  if (!_sortedCountFile->open())
+  {
+    throw HootException(
+      QObject::tr("Error opening %1 for writing.").arg(_sortedCountFile->fileName()));
+  }
+  LOG_DEBUG("Opened sorted temp file: " << _sortedCountFile->fileName());
+  if (_keepTempFiles)
+  {
+    LOG_WARN("Keeping temp file: " << _sortedCountFile->fileName());
+  }
+  if (!_countFile->exists())
+  {
+    throw HootException("Unable to sort file; file doesn't exist.");
+  }
+
+  // File names end up in a shell command line, so they must be quoted; otherwise a temp dir
+  // containing spaces or shell metacharacters breaks (or hijacks) the command.  Single quotes
+  // protect everything but a single quote itself, which is written as '\''.
+  const auto shellQuote = [](const QString& path) -> QString
+  {
+    QString quoted = path;
+    quoted.replace("'", "'\\''");
+    quoted.prepend("'");
+    quoted.append("'");
+    return quoted;
+  };
+  const QString parallelCount = QString::number(_sortParallelCount);
+
+  //This counts each unique line occurrence, sorts by decreasing occurrence count (necessary for
+  //next step which removes duplicate tag keys associated with the same word), and replaces the
+  //space between the prepended count and the word with a tab.
+
+  //sort by highest count, then by word, then by tag
+  // NOTE(review): the exit status of a pipeline is that of its last command, so a failure of
+  // sort or uniq is presumably not detected here.
+  const QString cmd =
+    "sort --parallel=" + parallelCount + " " + shellQuote(_countFile->fileName()) +
+    " | uniq -c | sort -n -r --parallel=" + parallelCount + " | " +
+    "sed -e 's/^ *//;s/ /\\t/' > " + shellQuote(_sortedCountFile->fileName());
+  if (std::system(cmd.toStdString().c_str()) != 0)
+  {
+    throw HootException("Unable to sort file.");
+  }
+  LOG_INFO(
+    "Wrote " <<
+    StringUtils::formatLargeNumber(FileUtils::getNumberOfLinesInFile(_sortedCountFile->fileName())) <<
+    " lines to sorted file.");
+}
+
+/**
+ * Reduces the sorted count file to at most one kvp per word/tag key combination.
+ *
+ * Reads "count<TAB>word<TAB>kvp" lines from the sorted count file and writes the first line seen
+ * for each word/tag key combination to a new deduped temp file.  Later lines with the same
+ * combination are dropped; those whose count equals that of the kept line are remembered in
+ * _duplicatedWordTagKeyCountsToValues so the tie can be resolved afterward.
+ *
+ * @throws HootException if the deduped temp file can't be opened or a line of the sorted count
+ * file doesn't have the expected format
+ */
+void ImplicitTagRawRulesDeriver::_removeDuplicatedKeyTypes()
+{
+  LOG_INFO("Removing duplicated tag key types from output...");
+
+  //i.e. don't allow amenity=school AND amenity=shop to be associated with the same word...pick one
+  //of them
+
+  _dedupedCountFile.reset(
+    new QTemporaryFile(
+      _tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX"));
+  _dedupedCountFile->setAutoRemove(!_keepTempFiles);
+  if (!_dedupedCountFile->open())
+  {
+    throw HootException(
+      QObject::tr("Error opening %1 for writing.").arg(_dedupedCountFile->fileName()));
+  }
+  LOG_DEBUG("Opened dedupe temp file: " << _dedupedCountFile->fileName());
+  if (_keepTempFiles)
+  {
+    LOG_WARN("Keeping temp file: " << _dedupedCountFile->fileName());
+  }
+
+  long lineCount = 0;
+  long writtenLineCount = 0;
+  while (!_sortedCountFile->atEnd())
+  {
+    const QString line = QString::fromUtf8(_sortedCountFile->readLine().constData()).trimmed();
+    LOG_VART(line);
+    const QStringList lineParts = line.split("\t");
+    LOG_VART(lineParts);
+    // Indexing past the end of the list is not recoverable, so reject lines that don't have the
+    // count, word and kvp fields up front.
+    if (lineParts.size() < 3)
+    {
+      throw HootException("Malformed line in sorted count file: " + line);
+    }
+    QString word = lineParts[1].trimmed();
+    LOG_VART(word);
+    const QString kvp = lineParts[2].trimmed();
+    LOG_VART(kvp);
+    const QString countStr = lineParts[0].trimmed();
+    const long count = countStr.toLong();
+    LOG_VART(count);
+    // Split on the first '=' only, so a tag value that itself contains '=' is kept whole.
+    const int kvpSeparatorIndex = kvp.indexOf("=");
+    if (kvpSeparatorIndex < 0)
+    {
+      throw HootException("Malformed tag in sorted count file: " + line);
+    }
+    const QString tagKey = kvp.left(kvpSeparatorIndex);
+    LOG_VART(tagKey);
+    const QString wordTagKey = word.trimmed() % ";" % tagKey.trimmed();
+    LOG_VART(wordTagKey);
+    // note that both keys are built from the word while '=' is still escaped in it
+    const QString wordTagKeyCount =
+      word.trimmed() % ";" % tagKey.trimmed() % ";" % countStr.trimmed();
+    LOG_VART(wordTagKeyCount);
+    const QString tagValue = kvp.mid(kvpSeparatorIndex + 1);
+    LOG_VART(tagValue);
+
+    //The lines are sorted in reverse by occurrence count.  So the first time we see one word/key
+    //combo, we know it had the highest occurrence count, and we can ignore all subsequent
+    //instances of it since any one feature can't have more than one tag applied to it with the
+    //same key.
+
+    const QString queriedCountAndValue = _wordKeysToCountsValues.value(wordTagKey, "");
+    if (queriedCountAndValue.isEmpty())
+    {
+      _wordKeysToCountsValues[wordTagKey] = countStr % ";" % tagValue;
+      //this unescaping must occur during the final temp file write; it handles the escape
+      //sequence in either case, even if both forms occur in the same word
+      word.replace("%3D", "=", Qt::CaseInsensitive);
+      const QString updatedLine = countStr % "\t" % word % "\t" % kvp % "\n";
+      LOG_VART(updatedLine);
+      _dedupedCountFile->write(updatedLine.toUtf8());
+      writtenLineCount++;
+    }
+    else
+    {
+      // The stored value is "count;tagValue".  A later line can't have a higher count than the
+      // kept one, so only an equal count, i.e. a tie, is of interest.
+      const long queriedCount = queriedCountAndValue.split(";")[0].toLong();
+      if (queriedCount == count)
+      {
+        LOG_TRACE(
+          "Recording duplicated word/tag key/count for: " << wordTagKeyCount << " with value: " <<
+          tagValue);
+        _duplicatedWordTagKeyCountsToValues[wordTagKeyCount].append(tagValue);
+      }
+    }
+
+    lineCount++;
+    if (lineCount % (_statusUpdateInterval * 10) == 0)
+    {
+      PROGRESS_INFO(
+        "Parsed " << StringUtils::formatLargeNumber(lineCount) <<
+        " lines from input for duplicated tag key removal.");
+    }
+  }
+  _sortedCountFile->close();
+  LOG_INFO(
+    "Wrote " << StringUtils::formatLargeNumber(writtenLineCount) << " lines to deduped file.");
+  _dedupedCountFile->close();
+}
+
+/**
+ * Resolves the count ties recorded by _removeDuplicatedKeyTypes.
+ *
+ * Copies the deduped count file to a new temp file.  For each line whose word/tag key/count has
+ * recorded tie values, the line's kvp is replaced by a tied kvp for which
+ * OsmSchema::isAncestor(tied kvp, line kvp) returns true, if there is one.  The tie collection
+ * is cleared afterward.
+ *
+ * @throws HootException if a temp file can't be opened or a line of the deduped count file
+ * doesn't have the expected format
+ */
+void ImplicitTagRawRulesDeriver::_resolveCountTies()
+{
+  //Any time more than one word/key combo has the same occurrence count, we need to pick just one
+  //of them.
+
+  LOG_INFO(
+    "Resolving word/tag key/count ties for " <<
+    StringUtils::formatLargeNumber(_duplicatedWordTagKeyCountsToValues.size()) <<
+    " duplicated word/tag key/counts...");
+
+  _tieResolvedCountFile.reset(
+    new QTemporaryFile(
+      _tempFileDir + "/implicit-tag-raw-rules-generator-temp-XXXXXX"));
+  _tieResolvedCountFile->setAutoRemove(!_keepTempFiles);
+  if (!_tieResolvedCountFile->open())
+  {
+    throw HootException(
+      QObject::tr("Error opening %1 for writing.").arg(_tieResolvedCountFile->fileName()));
+  }
+  LOG_DEBUG("Opened tie resolve temp file: " << _tieResolvedCountFile->fileName());
+  if (_keepTempFiles)
+  {
+    LOG_WARN("Keeping temp file: " << _tieResolvedCountFile->fileName());
+  }
+  if (!_dedupedCountFile->open())
+  {
+    throw HootException(
+      QObject::tr("Error opening %1 for reading.").arg(_dedupedCountFile->fileName()));
+  }
+
+  long lineCount = 0;
+  long duplicateResolutions = 0;
+  while (!_dedupedCountFile->atEnd())
+  {
+    const QString line = QString::fromUtf8(_dedupedCountFile->readLine().constData()).trimmed();
+    LOG_VART(line);
+    const QStringList lineParts = line.split("\t");
+    LOG_VART(lineParts);
+    // Indexing past the end of the list is not recoverable, so reject lines that don't have the
+    // count, word and kvp fields up front.
+    if (lineParts.size() < 3)
+    {
+      throw HootException("Malformed line in deduped count file: " + line);
+    }
+    QString word = lineParts[1].trimmed();
+    LOG_VART(word);
+    const QString kvp = lineParts[2].trimmed();
+    LOG_VART(kvp);
+    const QString countStr = lineParts[0].trimmed();
+    const long count = countStr.toLong();
+    LOG_VART(count);
+    // Split on the first '=' only, so a tag value that itself contains '=' is kept whole.
+    const int kvpSeparatorIndex = kvp.indexOf("=");
+    if (kvpSeparatorIndex < 0)
+    {
+      throw HootException("Malformed tag in deduped count file: " + line);
+    }
+    const QString tagKey = kvp.left(kvpSeparatorIndex);
+    LOG_VART(tagKey);
+    const QString wordTagKey = word.trimmed() % ";" % tagKey.trimmed();
+    LOG_VART(wordTagKey);
+    const QString wordTagKeyCount =
+      word.trimmed() % ";" % tagKey.trimmed() % ";" % countStr.trimmed();
+    LOG_VART(wordTagKeyCount);
+    const QString tagValue = kvp.mid(kvpSeparatorIndex + 1);
+    LOG_VART(tagValue);
+
+    // The ties were recorded under a key built from the word while '=' was still escaped in it
+    // (lower case "%3d", since words are lower-cased when first written), but the deduped file
+    // holds the unescaped word.  If the plain key isn't known, retry with the word re-escaped so
+    // ties for words containing '=' are found as well.
+    QString tieKey = wordTagKeyCount;
+    if (!_duplicatedWordTagKeyCountsToValues.contains(tieKey) && word.contains("="))
+    {
+      QString escapedWord = word;
+      escapedWord.replace("=", "%3d");
+      tieKey = escapedWord.trimmed() % ";" % tagKey.trimmed() % ";" % countStr.trimmed();
+    }
+
+    if (_duplicatedWordTagKeyCountsToValues.contains(tieKey))
+    {
+      LOG_TRACE("Resolving duplicated word/tag key/count for " << wordTagKeyCount << "...");
+
+      //To resolve the tie, we're going to pick the most specific kvp.  e.g. amenity=public_hall
+      //wins out of amenity=hall.  This is not really dealing with same hierarchy level tags
+      //(e.g. amenity=school and amenity=hall) and will just arbitrarily pick in that situation.
+      //Duplicates do seem to be fairly rare, but there could be some performance gains by coming
+      //up with a better way to handle this situation.
+      QString lineWithMostSpecificKvp = line % "\n";
+      const QStringList tagValues = _duplicatedWordTagKeyCountsToValues[tieKey];
+      for (int i = 0; i < tagValues.size(); i++)
+      {
+        const QString childKvp = tagKey % "=" % tagValues[i];
+        // every candidate is compared against the kvp of the line read, not against the
+        // candidate picked so far
+        if (OsmSchema::getInstance().isAncestor(childKvp, tagKey % "=" % tagValue))
+        {
+          lineWithMostSpecificKvp = countStr % "\t" % word % "\t" % childKvp % "\n";
+        }
+      }
+      LOG_VART(lineWithMostSpecificKvp);
+      _tieResolvedCountFile->write(lineWithMostSpecificKvp.toUtf8());
+      duplicateResolutions++;
+    }
+    else
+    {
+      const QString updatedLine = countStr % "\t" % word % "\t" % kvp % "\n";
+      LOG_VART(updatedLine);
+      _tieResolvedCountFile->write(updatedLine.toUtf8());
+    }
+
+    lineCount++;
+    if (lineCount % (_statusUpdateInterval * 10) == 0)
+    {
+      PROGRESS_INFO(
+        "Parsed " << StringUtils::formatLargeNumber(lineCount) <<
+        " lines from input for duplicated tag key count ties.");
+    }
+  }
+  LOG_VARD(lineCount);
+  LOG_INFO(
+    "Resolved " << StringUtils::formatLargeNumber(duplicateResolutions) <<
+    " word/tag key/count ties.");
+  _duplicatedWordTagKeyCountsToValues.clear();
+  // the deduped file was reopened above for reading and is fully consumed at this point
+  _dedupedCountFile->close();
+  _tieResolvedCountFile->close();
+}
+
+/**
+ * Sorts the given "count<TAB>word<TAB>kvp" file by word, then by kvp, into the final output file.
+ *
+ * The work is done by the sort shell command.
+ *
+ * @param input temp file to sort
+ * @throws HootException if the input doesn't exist or the shell command fails
+ */
+void ImplicitTagRawRulesDeriver::_sortByWord(const std::shared_ptr<QTemporaryFile>& input)
+{
+  LOG_INFO("Sorting output by word...");
+  if (!input->exists())
+  {
+    throw HootException("Unable to sort file; file doesn't exist.");
+  }
+
+  // File names end up in a shell command line, and the output path is supplied by the caller, so
+  // they must be quoted; otherwise a path containing spaces or shell metacharacters breaks (or
+  // hijacks) the command.  Single quotes protect everything but a single quote itself, which is
+  // written as '\''.
+  const auto shellQuote = [](const QString& path) -> QString
+  {
+    QString quoted = path;
+    quoted.replace("'", "'\\''");
+    quoted.prepend("'");
+    quoted.append("'");
+    return quoted;
+  };
+
+  //sort by word, then by tag
+  // The field separator is passed as a literal tab in plain single quotes.  The $'...' form is a
+  // shell extension that the shell run by std::system isn't guaranteed to support.
+  const QString cmd =
+    "sort -t'\t' -k2,2 -k3,3 --parallel=" + QString::number(_sortParallelCount) + " " +
+     shellQuote(input->fileName()) + " -o " + shellQuote(_output->fileName());
+  if (std::system(cmd.toStdString().c_str()) != 0)
+  {
+    throw HootException("Unable to sort input file.");
+  }
+
+  LOG_VARD(_output->fileName());
+  LOG_INFO(
+    "Wrote " <<
+    StringUtils::formatLargeNumber(
+      FileUtils::getNumberOfLinesInFile(_output->fileName())) << " lines to final sorted file.");
+}
+
+}
Clone this wiki locally