v0.2.48..v0.2.49 changeset ImplicitTagRulesDatabaseDeriver.cpp
Garret Voltz edited this page Oct 2, 2019
·
1 revision
diff --git a/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRulesDatabaseDeriver.cpp b/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRulesDatabaseDeriver.cpp
new file mode 100644
index 0000000..f8786ff
--- /dev/null
+++ b/hoot-core/src/main/cpp/hoot/core/schema/ImplicitTagRulesDatabaseDeriver.cpp
@@ -0,0 +1,468 @@
+/*
+ * This file is part of Hootenanny.
+ *
+ * Hootenanny is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * --------------------------------------------------------------------
+ *
+ * The following copyright notices are generated automatically. If you
+ * have a new notice to add, please use the format:
+ * " * @copyright Copyright ..."
+ * This will properly maintain the copyright information. DigitalGlobe
+ * copyrights will be updated automatically.
+ *
+ * @copyright Copyright (C) 2017, 2018, 2019 DigitalGlobe (http://www.digitalglobe.com/)
+ */
+#include "ImplicitTagRulesDatabaseDeriver.h"
+
+// hoot
+#include <hoot/core/elements/Tags.h>
+#include <hoot/core/util/StringUtils.h>
+#include <hoot/core/util/ConfigOptions.h>
+#include <hoot/core/util/HootException.h>
+#include <hoot/core/schema/OsmSchema.h>
+#include <hoot/core/algorithms/string/StringTokenizer.h>
+#include <hoot/core/io/ImplicitTagRulesSqliteWriter.h>
+
+// Qt
+#include <QStringBuilder>
+
+namespace hoot
+{
+
+ImplicitTagRulesDatabaseDeriver::ImplicitTagRulesDatabaseDeriver() : // sets defaults; setConfiguration() can override all but the status interval
+_statusUpdateInterval(ConfigOptions().getTaskStatusUpdateInterval()), // read from default config; drives progress logging in _applyFiltering()
+_minTagOccurrencesPerWord(1), // deriveRulesDatabase() only applies count thresholding when this is >= 2
+_minWordLength(1), // _applyFiltering() skips words shorter than this
+_useSchemaTagValuesForWordsOnly(true) // when true, words must contain a schema tag value token to be kept
+{
+}
+
+/**
+ * Configures this deriver from the supplied settings.
+ *
+ * Each implicit tagging database deriver option is read from the settings and passed to the
+ * corresponding setter on this class.
+ *
+ * @param conf settings to read the deriver options from
+ */
+void ImplicitTagRulesDatabaseDeriver::setConfiguration(const Settings& conf)
+{
+  const ConfigOptions opts(conf);
+
+  // thresholds and the schema value restriction
+  setMinTagOccurrencesPerWord(
+    opts.getImplicitTaggingDatabaseDeriverMinimumTagOccurrencesPerWord());
+  setMinWordLength(
+    opts.getImplicitTaggingDatabaseDeriverMinimumWordLength());
+  setUseSchemaTagValuesForWordsOnly(
+    opts.getImplicitTaggingDatabaseDeriverUseSchemaTagValuesForWordsOnly());
+
+  // files backing the custom rule, tag ignore, and word ignore lists
+  setCustomRuleFile(
+    opts.getImplicitTaggingDatabaseDeriverCustomRuleFile());
+  setTagIgnoreFile(
+    opts.getImplicitTaggingDatabaseDeriverTagIgnoreFile());
+  setWordIgnoreFile(
+    opts.getImplicitTaggingDatabaseDeriverWordIgnoreFile());
+}
+
+/**
+ * Derives an implicit tag rules database from a word/tag count file.
+ *
+ * The input is optionally thresholded by occurrence count and filtered by word length, the
+ * word/tag ignore lists, the custom rules list, and the schema tag values before it is written to
+ * the output database. If none of those criteria are active, the input is written out unmodified.
+ *
+ * @param input path to a *.implicitTagRules count file
+ * @param output path to the *.sqlite rules database to write
+ * @throws IllegalArgumentException if the input does not have the expected extension
+ * @throws HootException if the output does not have the expected extension
+ */
+void ImplicitTagRulesDatabaseDeriver::deriveRulesDatabase(const QString& input, const QString& output)
+{
+  _validateInputs(input, output);
+
+  LOG_INFO(
+    "Deriving implicit tag rules for input: " << input << ". Writing to output: " <<
+    output << "...");
+
+  LOG_VARD(_minTagOccurrencesPerWord);
+  LOG_VARD(_minWordLength);
+  LOG_VARD(_customRules.getWordIgnoreFile());
+  LOG_VARD(_customRules.getTagIgnoreFile());
+  LOG_VARD(_customRules.getCustomRuleFile());
+  LOG_VARD(_useSchemaTagValuesForWordsOnly);
+
+  _customRules.init();
+
+  LOG_VARD(_customRules.getWordIgnoreList().size());
+  LOG_VARD(_customRules.getWordIgnoreList());
+  LOG_VARD(_customRules.getTagIgnoreList().size());
+  LOG_VARD(_customRules.getTagIgnoreList());
+  LOG_VARD(_customRules.getCustomRulesList().size());
+  LOG_VARD(_customRules.getCustomRulesList());
+
+  const bool noFilteringCriteria =
+    _minTagOccurrencesPerWord == 1 && _minWordLength == 1 &&
+    _customRules.getWordIgnoreList().size() == 0 && _customRules.getTagIgnoreList().size() == 0 &&
+    _customRules.getCustomRulesList().size() == 0 && !_useSchemaTagValuesForWordsOnly;
+  if (noFilteringCriteria)
+  {
+    // The minimum occurrence count is known to be exactly 1 here, so thresholding can never apply
+    // on this path; the input is passed straight through to the database writer.
+    LOG_INFO("Skipping filtering, as no filtering criteria were specified...");
+    LOG_INFO("Skipping count thresholding since threshold = 1...");
+    _writeRules(input, output);
+    return;
+  }
+
+  if (_useSchemaTagValuesForWordsOnly)
+  {
+    _populateSchemaTagValues();
+  }
+
+  if (_minTagOccurrencesPerWord >= 2)
+  {
+    // threshold first, then filter the thresholded temp file
+    _removeKvpsBelowOccurrenceThreshold(input, _minTagOccurrencesPerWord);
+    _applyFiltering(_thresholdedCountFile->fileName());
+  }
+  else
+  {
+    LOG_INFO("Skipping count thresholding since min occurrence threshold = 1...");
+    _applyFiltering(input);
+  }
+
+  // _applyFiltering() always leaves its result in the filtered temp file
+  _writeRules(_filteredCountFile->fileName(), output);
+}
+
+/**
+ * Hands a count file to the SQLite rules writer.
+ *
+ * @param input path passed to the writer's write call
+ * @param output path passed to the writer's open call
+ */
+void ImplicitTagRulesDatabaseDeriver::_writeRules(const QString& input, const QString& output)
+{
+  // open the target, write from the input, then release the writer
+  ImplicitTagRulesSqliteWriter sqliteWriter;
+  sqliteWriter.open(output);
+  sqliteWriter.write(input);
+  sqliteWriter.close();
+}
+
+/**
+ * Copies the lines of the input count file whose occurrence count (first field) meets the
+ * minimum threshold into a new temp file, which is left in _thresholdedCountFile.
+ *
+ * The copy is performed by an external awk process run through the shell.
+ *
+ * @param input path to the count file to threshold
+ * @param minOccurrencesThreshold minimum occurrence count a line must have to be kept
+ * @throws HootException if the input file does not exist, the temp file cannot be opened, or the
+ * awk command fails
+ */
+void ImplicitTagRulesDatabaseDeriver::_removeKvpsBelowOccurrenceThreshold(const QString& input,
+  const int minOccurrencesThreshold)
+{
+  LOG_INFO(
+    "Removing tags below minimum occurrence threshold of: " +
+    QString::number(minOccurrencesThreshold) << "...");
+
+  // Check the input before creating the temp file, so a bad path doesn't leave a stray temp file
+  // behind.
+  if (!QFile(input).exists())
+  {
+    throw HootException("Unable to clean file; file doesn't exist.");
+  }
+
+  const bool keepTempFiles = ConfigOptions().getImplicitTaggingKeepTempFiles();
+  _thresholdedCountFile.reset(
+    new QTemporaryFile(
+      ConfigOptions().getApidbBulkInserterTempFileDir() +
+      "/implicit-tag-rules-deriver-temp-XXXXXX"));
+  _thresholdedCountFile->setAutoRemove(!keepTempFiles);
+  if (!_thresholdedCountFile->open())
+  {
+    throw HootException(
+      QObject::tr("Error opening %1 for writing.").arg(_thresholdedCountFile->fileName()));
+  }
+  LOG_DEBUG("Opened thresholded temp file: " << _thresholdedCountFile->fileName());
+  if (keepTempFiles)
+  {
+    LOG_WARN("Keeping temp file: " << _thresholdedCountFile->fileName());
+  }
+
+  // Single quote both paths for the shell so that spaces and shell metacharacters in them are
+  // taken literally; an embedded single quote is written as '\''.
+  QString quotedInput = input;
+  quotedInput.replace("'", "'\\''");
+  QString quotedOutput = _thresholdedCountFile->fileName();
+  quotedOutput.replace("'", "'\\''");
+
+  // This removes lines with occurrence counts below the specified threshold. The awk comparison is
+  // a strict greater-than, so the limit passed in is one less than the minimum in order to keep
+  // counts equal to the minimum. The input is fed through stdin rather than as an awk operand,
+  // since awk treats operands containing '=' as variable assignments.
+  const QString cmd =
+    "awk -v limit=" + QString::number(minOccurrencesThreshold - 1) +
+    " '$1 > limit{print}' < '" + quotedInput + "' > '" + quotedOutput + "'";
+  LOG_DEBUG(cmd);
+  if (std::system(cmd.toStdString().c_str()) != 0)
+  {
+    throw HootException("Unable to clean input file.");
+  }
+  _thresholdedCountFile->close();
+}
+
+/**
+ * Determines whether a word should be rejected because none of its tokens is a schema tag value.
+ *
+ * The check only applies when _useSchemaTagValuesForWordsOnly is enabled, the word is non-blank,
+ * and the word is not on the word ignore list; in every other case false is returned. When the
+ * check applies, the word passes (false is returned) as soon as any of its lower-cased tokens is
+ * both a schema tag value and absent from the word ignore list.
+ *
+ * @param word the word to examine
+ * @return true if the schema requirement is being enforced and the word fails it
+ */
+bool ImplicitTagRulesDatabaseDeriver::_wordIsNotASchemaTagValue(const QString& word)
+{
+  StringTokenizer tokenizer;
+
+  // nothing to enforce for these words
+  if (!_useSchemaTagValuesForWordsOnly || word.trimmed().isEmpty() ||
+      _customRules.getWordIgnoreList().contains(word, Qt::CaseInsensitive))
+  {
+    return false;
+  }
+
+  const QStringList tokens = tokenizer.tokenize(word.toLower());
+  for (int idx = 0; idx < tokens.size(); ++idx)
+  {
+    const QString token = tokens.at(idx);
+    const bool tokenIsUsableSchemaValue =
+      _schemaTagValues.contains(token) &&
+      !_customRules.getWordIgnoreList().contains(token, Qt::CaseInsensitive);
+    if (tokenIsUsableSchemaValue)
+    {
+      // one matching token makes the whole word eligible for implicit tag extraction
+      return false;
+    }
+  }
+  return true;
+}
+
+/**
+ * Verifies the input and output paths by file extension.
+ *
+ * @param input must end with .implicitTagRules
+ * @param output must end with .sqlite
+ * @throws IllegalArgumentException if the input extension is wrong
+ * @throws HootException if the output extension is wrong
+ */
+void ImplicitTagRulesDatabaseDeriver::_validateInputs(const QString& input, const QString& output)
+{
+  const bool inputIsCountFile = input.endsWith(".implicitTagRules");
+  if (!inputIsCountFile)
+  {
+    const QString inputError =
+      QString("A *.implicitTagRules file must be the input to implicit tag rules derivation. ") +
+      QString("Input specified: ") + input;
+    throw IllegalArgumentException(inputError);
+  }
+
+  const bool outputIsDatabase = output.endsWith(".sqlite");
+  if (!outputIsDatabase)
+  {
+    const QString outputError =
+      "Incorrect output specified: " + output + ". Must be a .sqlite database file.";
+    throw HootException(outputError);
+  }
+}
+
+/**
+ * Rebuilds _schemaTagValues from the POI category tags of the schema.
+ *
+ * Each non-wildcard tag value is lower-cased with underscores turned into spaces. The whole value
+ * and each of its tokens are added unless they appear on the word ignore list. _wordsNotInSchema
+ * is cleared as well.
+ */
+void ImplicitTagRulesDatabaseDeriver::_populateSchemaTagValues()
+{
+  _schemaTagValues.clear();
+  _wordsNotInSchema.clear();
+
+  // TODO: should the use and/or building categories be added here?
+  const std::vector<SchemaVertex> tags =
+    OsmSchema::getInstance().getTagByCategory(OsmSchemaCategory::poi());
+  StringTokenizer tokenizer;
+  for (size_t tagIdx = 0; tagIdx < tags.size(); ++tagIdx)
+  {
+    const SchemaVertex& tag = tags[tagIdx];
+    const QString tagVal = tag.value.toLower().replace("_", " ");
+    if (tagVal.contains("*"))
+    {
+      //skip wildcards
+      continue;
+    }
+
+    // The whole value is considered first, followed by each of its tokens; all of them go
+    // through the same ignore list check and spelling handling.
+    QStringList candidates;
+    candidates << tagVal;
+    candidates += tokenizer.tokenize(tagVal);
+
+    for (int candidateIdx = 0; candidateIdx < candidates.size(); ++candidateIdx)
+    {
+      const QString candidate = candidates.at(candidateIdx);
+      if (_customRules.getWordIgnoreList().contains(candidate, Qt::CaseInsensitive))
+      {
+        continue;
+      }
+
+      _schemaTagValues.insert(candidate);
+      //dealing with the uk english spellings on an as seen basis; this should be expanded and
+      //made more extensible
+      if (candidate == "theatre")
+      {
+        _schemaTagValues.insert("theater");
+      }
+      if (candidate == "centre")
+      {
+        _schemaTagValues.insert("center");
+      }
+      LOG_TRACE("Appended " << candidate << " to schema tag values.");
+    }
+  }
+
+  LOG_VARD(_schemaTagValues.size());
+  //sort for viewing only
+  QStringList schemaTagValuesList = _schemaTagValues.toList();
+  qSort(schemaTagValuesList.begin(), schemaTagValuesList.end());
+  LOG_VART(schemaTagValuesList);
+}
+
+/**
+ * Filters a count file into a new temp file, which is left in _filteredCountFile.
+ *
+ * Each input line is expected to hold three tab separated fields: occurrence count, word, and
+ * key=value tag. A line is dropped when:
+ *  - the word is shorter than _minWordLength
+ *  - the word is on the word ignore list
+ *  - the schema tag value requirement is enforced and the word fails it
+ *  - the tag, or its key with a wildcard value, is on the tag ignore list
+ *  - the word/tag combination is already present in the custom rules list
+ * Blank lines are skipped silently; lines with fewer than three fields are skipped and reported.
+ * After all input lines have been processed, the custom rules are appended to the temp file.
+ *
+ * @param input path to the count file to filter
+ * @throws HootException if the temp file or the input file cannot be opened
+ */
+void ImplicitTagRulesDatabaseDeriver::_applyFiltering(const QString& input)
+{
+  LOG_INFO("Applying word/tag/rule filtering to output...");
+
+  _filteredCountFile.reset(
+    new QTemporaryFile(
+      ConfigOptions().getApidbBulkInserterTempFileDir() +
+      "/implicit-tag-rules-deriver-temp-XXXXXX"));
+  _filteredCountFile->setAutoRemove(!ConfigOptions().getImplicitTaggingKeepTempFiles());
+  if (!_filteredCountFile->open())
+  {
+    throw HootException(
+      QObject::tr("Error opening %1 for writing.").arg(_filteredCountFile->fileName()));
+  }
+  LOG_DEBUG("Opened filtered temp file: " << _filteredCountFile->fileName());
+  if (ConfigOptions().getImplicitTaggingKeepTempFiles())
+  {
+    LOG_WARN("Keeping temp file: " << _filteredCountFile->fileName());
+  }
+  QFile inputFile(input);
+  if (!inputFile.open(QIODevice::ReadOnly))
+  {
+    throw HootException(QObject::tr("Error opening %1 for reading.").arg(input));
+  }
+  LOG_DEBUG("Opened input file: " << input);
+
+  // Neither list is modified while filtering, so fetch them once rather than once per line.
+  const QStringList tagIgnoreList = _customRules.getTagIgnoreList();
+  const QMap<QString, QString> customRulesList = _customRules.getCustomRulesList();
+
+  long linesParsedCount = 0;
+  long linesWrittenCount = 0;
+  long malformedLinesCount = 0;
+  long wordsTooSmallCount = 0;
+  long ignoredWordsCount = 0;
+  long ignoredTagsCount = 0;
+  long ignoredRuleCountDueToCustomRules = 0;
+  long wordNotASchemaValueCount = 0;
+
+  while (!inputFile.atEnd())
+  {
+    const QString line = QString::fromUtf8(inputFile.readLine().constData()).trimmed();
+    LOG_VART(line);
+    if (line.isEmpty())
+    {
+      // e.g. a trailing blank line; nothing to parse
+      continue;
+    }
+    const QStringList lineParts = line.split("\t");
+    LOG_VART(lineParts);
+    if (lineParts.size() < 3)
+    {
+      // Indexing the word/tag fields of a short line would be an out of range list access.
+      LOG_TRACE("Skipping malformed count file line: " << line << ".");
+      malformedLinesCount++;
+      continue;
+    }
+    QString word = lineParts[1].trimmed();
+    LOG_VART(word);
+
+    //this won't come back true unless _useSchemaTagValuesForWordsOnly = true.
+    const bool wordNotASchemaTagValue = _wordIsNotASchemaTagValue(word);
+
+    const bool wordTooSmall = word.length() < _minWordLength;
+
+    if (!wordTooSmall && !_customRules.getWordIgnoreList().contains(word, Qt::CaseInsensitive) &&
+        !wordNotASchemaTagValue)
+    {
+      const QString kvp = lineParts[2].trimmed();
+      LOG_VART(kvp);
+      const QString tagKey = kvp.split("=")[0];
+      LOG_VART(tagKey);
+      const QString keyWildCard = tagKey % "=*";
+      LOG_VART(keyWildCard);
+
+      // a tag is ignored either by exact key=value match or by a key=* entry
+      const bool ignoreTag = tagIgnoreList.contains(kvp) || tagIgnoreList.contains(keyWildCard);
+      LOG_VART(ignoreTag);
+
+      if (ignoreTag)
+      {
+        LOG_TRACE("Skipping tag on the ignore list: " << kvp << ".");
+        ignoredTagsCount++;
+      }
+      else if (customRulesList.value(word.toLower(), "") == kvp)
+      {
+        //Skip the word/tag combo if we already have a custom rule that is associated with it
+        //(they're written to the filtered file after this filtering).
+        LOG_TRACE(
+          "Skipping word/tag combo on custom rule list. Word: " << word << ", tag: " <<
+          kvp << ".");
+        ignoredRuleCountDueToCustomRules++;
+      }
+      else
+      {
+        //write the valid count line
+        const long count = lineParts[0].trimmed().toLong();
+        LOG_VART(count);
+        const QString line = QString::number(count) % "\t" % word % "\t" % kvp % "\n";
+        LOG_VART(line);
+        _filteredCountFile->write(line.toUtf8());
+        linesWrittenCount++;
+      }
+    }
+    else
+    {
+      if (wordTooSmall)
+      {
+        LOG_TRACE(
+          "Skipping word: " << word <<
+          ", the length of which is less than the minimum allowed word length of: " <<
+          _minWordLength);
+        wordsTooSmallCount++;
+      }
+      else if (wordNotASchemaTagValue)
+      {
+        LOG_TRACE(
+          "Schema tag value requirement for word is being enforced and word is not a schema " <<
+          "tag value: " << word.toLower() << ".");
+        _wordsNotInSchema.insert(word.toLower());
+        wordNotASchemaValueCount++;
+      }
+      else
+      {
+        LOG_TRACE("Skipping word on the ignore list: " << word << ".");
+        ignoredWordsCount++;
+      }
+    }
+
+    linesParsedCount++;
+    if (linesParsedCount % (_statusUpdateInterval * 100) == 0)
+    {
+      PROGRESS_INFO(
+        "Filtered " << StringUtils::formatLargeNumber(linesParsedCount) <<
+        " count file lines from input.");
+    }
+  }
+  inputFile.close();
+
+  LOG_INFO("Parsed " << StringUtils::formatLargeNumber(linesParsedCount) << " words.");
+  if (malformedLinesCount > 0)
+  {
+    LOG_WARN(
+      "Skipped " << StringUtils::formatLargeNumber(malformedLinesCount) <<
+      " malformed count file lines.");
+  }
+  LOG_INFO(
+    "Skipped " << StringUtils::formatLargeNumber(wordsTooSmallCount) <<
+    " words that were too small.");
+  LOG_INFO("Ignored " << StringUtils::formatLargeNumber(ignoredWordsCount) << " words.");
+  LOG_INFO("Ignored " << StringUtils::formatLargeNumber(ignoredTagsCount) << " tags.");
+  LOG_INFO(
+    "Ignored " << StringUtils::formatLargeNumber(ignoredRuleCountDueToCustomRules) <<
+    " rules due to them overlapping with the custom rules list.");
+  LOG_INFO(
+    "Skipped " << StringUtils::formatLargeNumber(wordNotASchemaValueCount) <<
+    " words that were not a schema value.");
+  if (wordNotASchemaValueCount > 0)
+  {
+    //sorted for viewing only
+    QStringList wordsNotInSchemaList = _wordsNotInSchema.toList();
+    qSort(wordsNotInSchemaList.begin(), wordsNotInSchemaList.end());
+    LOG_VART(wordsNotInSchemaList);
+  }
+
+  //technically this could be done outside of this filtering...
+  _writeCustomRules(linesWrittenCount);
+
+  LOG_INFO(
+    "Wrote " << StringUtils::formatLargeNumber(linesWrittenCount) << " / " <<
+    StringUtils::formatLargeNumber(linesParsedCount) << " lines to filtered file.");
+
+  _filteredCountFile->close();
+}
+
+/**
+ * Appends every custom rule to the filtered count file as a count/word/tag line.
+ *
+ * Custom rule lines are written with a count of INT_MAX. _filteredCountFile is written to
+ * directly, so it has to be open when this is called.
+ *
+ * @param linesWrittenCount running total of lines written to the filtered file; incremented once
+ * per custom rule written
+ */
+void ImplicitTagRulesDatabaseDeriver::_writeCustomRules(long& linesWrittenCount)
+{
+  // would like to know somehow if any of the custom rules overlap with the db derived
+  // rules from the public data - #2300
+
+  LOG_DEBUG("Writing custom rules...");
+  LOG_VARD(_customRules.getCustomRulesList().size());
+  if (_customRules.getCustomRulesList().size() <= 0)
+  {
+    // no custom rules, so nothing to write or report
+    return;
+  }
+
+  long numRulesWritten = 0;
+  const QMap<QString, QString> customRulesList = _customRules.getCustomRulesList();
+  QMap<QString, QString>::const_iterator ruleItr = customRulesList.begin();
+  while (ruleItr != customRulesList.end())
+  {
+    const QString line =
+      QString::number(INT_MAX) % "\t" % ruleItr.key().trimmed() % "\t" %
+      ruleItr.value().trimmed() % "\n";
+    LOG_VART(line);
+    _filteredCountFile->write(line.toUtf8());
+    numRulesWritten++;
+    linesWrittenCount++;
+    ++ruleItr;
+  }
+  LOG_INFO("Wrote " << numRulesWritten << " custom rules.");
+}
+
+}