Update hit term extractor to use scores when considering hit terms

Squashed commits: documentation pt1; format; documentation pt2; format; set retry to 0 and add log for testing; wordsandscores test; testing; format; fix documentation and add new method; more tests, fix edge case, documentation; test, log, tostring; more methods, more tests; add getArrSize; pass skips on end of cq.
NationalSecurityAgency · Mar 26, 2024 · b00394a · b00394a
1 parent ea77d49
commit b00394a
Show file tree

Hide file tree

Showing 7 changed files with 518 additions and 73 deletions.
diff --git a/.../query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java b/.../query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java
@@ -2,11 +2,8 @@
 
 import java.io.IOException;
 import java.nio.charset.CharacterCodingException;
-import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
 import java.util.Map;
 import java.util.SortedSet;
 import java.util.TreeSet;
@@ -66,6 +63,11 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
     // the top value
     protected Value tv;
 
+    // how many words skipped in the lower half of the excerpt
+    private int leftSkippedWords;
+    // how many words skipped in the upper half of the excerpt
+    private int rightSkippedWords;
+
     @Override
     public IteratorOptions describeOptions() {
         IteratorOptions options = new IteratorOptions(TermFrequencyExcerptIterator.class.getSimpleName(),
@@ -135,6 +137,8 @@ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> op
         this.startOffset = Integer.parseInt(options.get(START_OFFSET));
         this.endOffset = Integer.parseInt(options.get(END_OFFSET));
         this.fieldName = options.get(FIELD_NAME);
+        leftSkippedWords = 0;
+        rightSkippedWords = 0;
     }
 
     @Override
@@ -264,7 +268,8 @@ public void next() throws IOException {
         Text cv = top.getColumnVisibility();
         long ts = top.getTimestamp();
         Text row = top.getRow();
-        List<String>[] terms = new List[endOffset - startOffset];
+        // set the size of the array to the amount of terms we need to choose
+        WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];
 
         // while we have term frequencies for the same document
         while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
@@ -278,6 +283,8 @@ public void next() throws IOException {
                 try {
                     // parse the offsets from the value
                     TermWeight.Info info = TermWeight.Info.parseFrom(source.getTopValue().get());
+                    // check if the number of scores is equal to the number of offsets
+                    boolean useScores = info.getScoreCount() == info.getTermOffsetCount();
 
                     // for each offset, gather all the terms in our range
                     for (int i = 0; i < info.getTermOffsetCount(); i++) {
@@ -286,12 +293,16 @@ public void next() throws IOException {
                         if (offset >= startOffset && offset < endOffset) {
                             // calculate the index in our value list
                             int index = offset - startOffset;
-                            // if the value is larger than the value for this offset thus far
+                            // if the current index has no words/scores yet, initialize an object at the index
                             if (terms[index] == null) {
-                                terms[index] = new ArrayList<>();
+                                terms[index] = new WordsAndScores();
+                            }
+                            // if we are using scores, add the word and score to the object, if not then only add the word
+                            if (useScores) {
+                                terms[index].addTerm(fieldAndValue[1], info.getScore(i));
+                            } else {
+                                terms[index].addTerm(fieldAndValue[1]);
                             }
-                            // use this value
-                            terms[index].add(fieldAndValue[1]);
                         }
                     }
                 } catch (InvalidProtocolBufferException e) {
@@ -302,9 +313,10 @@ public void next() throws IOException {
             // get the next term frequency
             source.next();
         }
-
         // generate the return key and value
-        tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
+        tk = new Key(row, new Text(dtUid),
+                        new Text(fieldName + Constants.NULL + generatePhrase(terms) + Constants.NULL + leftSkippedWords + Constants.NULL + rightSkippedWords),
+                        cv, ts);
         tv = new Value();
     }
 
@@ -315,28 +327,29 @@ public void next() throws IOException {
      *            the terms to create a phrase from
      * @return the phrase
      */
-    protected String generatePhrase(List<String>[] terms) {
-        String[] largestTerms = new String[terms.length];
+    protected String generatePhrase(WordsAndScores[] terms) {
+        // create an array with the same length as the one we just passed in
+        String[] termsToOutput = new String[terms.length];
         for (int i = 0; i < terms.length; i++) {
-            largestTerms[i] = getLongestTerm(terms[i]);
-        }
-
-        return joiner.join(largestTerms);
-    }
-
-    /**
-     * Get the longest term from a list of terms;
-     *
-     * @param terms
-     *            the terms to create a phrase
-     * @return the longest term (null if empty or null list)
-     */
-    protected String getLongestTerm(List<String> terms) {
-        if (terms == null || terms.isEmpty()) {
-            return null;
-        } else {
-            return terms.stream().max(Comparator.comparingInt(String::length)).get();
+            // if there is nothing at this position, put nothing in the position in the new one
+            if (terms[i] == null) {
+                termsToOutput[i] = null;
+            } else {
+                // if the term to put in this position is in the stop list, increment the corresponding *skippedWords and put nothing in this position
+                if (WordsAndScores.STOP_WORD_LIST.contains(terms[i].getWordToOutput())) {
+                    if (i <= (terms.length / 2)) {
+                        leftSkippedWords++;
+                    } else {
+                        rightSkippedWords++;
+                    }
+                    termsToOutput[i] = null;
+                } else {
+                    // if the term to output is valid, put it in the same position in the new array
+                    termsToOutput[i] = terms[i].getWordToOutput();
+                }
+            }
         }
+        return joiner.join(termsToOutput);
     }
 
     /**
@@ -473,15 +486,8 @@ private String getDtUid(String str) {
 
     @Override
     public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("TermFrequencyExcerptIterator: ");
-        sb.append(this.fieldName);
-        sb.append(", ");
-        sb.append(this.startOffset);
-        sb.append(", ");
-        sb.append(this.endOffset);
-
-        return sb.toString();
+
+        return "TermFrequencyExcerptIterator: " + this.fieldName + ", " + this.startOffset + ", " + this.endOffset;
     }
 
 }
diff --git a/warehouse/query-core/src/main/java/datawave/query/iterator/logic/WordsAndScores.java b/warehouse/query-core/src/main/java/datawave/query/iterator/logic/WordsAndScores.java
@@ -0,0 +1,214 @@
+package datawave.query.iterator.logic;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.log4j.Logger;
+
+/**
+ * An object used to save terms and their respective scores (if a score exists and is valid).
+ */
+public class WordsAndScores {
+    private static final Logger log = Logger.getLogger(WordsAndScores.class);
+
+    /** the arraylist of words */
+    private ArrayList<String> words;
+    /** the arraylist of scores */
+    private ArrayList<Integer> scores;
+
+    /** the index of the word with the longest length */
+    private int longestWordIndex;
+    /** the index of the word with the smallest score (smallest non-negative is best) */
+    private int smallestScoreIndex;
+    /** the size of the words and scores arrays */
+    private int arrSize;
+
+    private boolean useScores;
+
+    /**
+     * the list of stop words (words to skip when outputting the excerpt).
+     * <p>
+     * </p>
+     * used by TermFrequencyExcerptIterator.generatePhrase
+     */
+    public static final Set<String> STOP_WORD_LIST = Set.of("<eps>");
+
+    public WordsAndScores() {
+        words = new ArrayList<>();
+        scores = new ArrayList<>();
+        longestWordIndex = -1;
+        smallestScoreIndex = -1;
+        arrSize = 0;
+        useScores = false;
+    }
+
+    public void reset() {
+        words.clear();
+        scores.clear();
+        longestWordIndex = -1;
+        smallestScoreIndex = -1;
+        arrSize = 0;
+        useScores = false;
+    }
+
+    /**
+     * A method to add a word and score into the object and saves its index if it is the longest word or has the smallest (non-negative) score.
+     *
+     * @param word
+     *            the word to add
+     * @param score
+     *            the score to add
+     */
+    public void addTerm(String word, int score) {
+        arrSize++; // increment the size of the arrays
+        words.add(word); // add the word to its array
+        scores.add(score); // add the score to its array
+        if (arrSize > 1) { // if this is at least the second word/score we are adding...
+            if (word.length() > words.get(longestWordIndex).length()) { // if this word is longer than the current longest word
+                longestWordIndex = arrSize - 1; // set this index as the longest word
+            }
+            if (smallestScoreIndex == -1) { // if we do not already have a valid smallestScore...
+                if (score >= 0) { // if the passed in score is valid...
+                    smallestScoreIndex = arrSize - 1; // set this index as the smallest score
+                    useScores = true;
+                }
+            } else { // if we have a current smallestScore...
+                if (score >= 0 && score <= scores.get(smallestScoreIndex)) { // if this score is non-negative and smaller than or equal to the current smallest
+                                                                             // score...
+                    if (score == scores.get(smallestScoreIndex)) { // if this score is equal to the smallest score (just in case this happens for some
+                                                                   // reason)...
+                        log.info("Two tokens have the same score: Choosing the longest one.");
+                        if (word.length() > words.get(smallestScoreIndex).length()) { // if this word is longer than the word with the smallest score...
+                            smallestScoreIndex = arrSize - 1; // set this index as the smallest score
+                        }
+                    } else { // if this score is smaller than the current smallest score...
+                        smallestScoreIndex = arrSize - 1; // set this index as the smallest score
+                    }
+                }
+            }
+        } else { // if this is the first word/score we are adding, set index 0 (the only one) as the longest/smallest
+            longestWordIndex = 0;
+            if (score >= 0) {
+                smallestScoreIndex = 0;
+                useScores = true;
+            }
+        }
+    }
+
+    /**
+     * A method to add a word into the object and saves its index if it is the longest word.
+     *
+     * @param word
+     *            the word to add
+     */
+    public void addTerm(String word) {
+        arrSize++; // increment array size
+        words.add(word); // add the word to its array
+        scores.add(-1); // add the default score to its array
+        if (arrSize > 1) { // if this is atleast the second word we are adding...
+            if (word.length() > words.get(longestWordIndex).length()) { // if this word is longer than the current longest word...
+                longestWordIndex = arrSize - 1; // set this index as the longest word
+            }
+        } else { // if this is the first word we are adding set the longest word index to 0 (the only one)
+            longestWordIndex = 0;
+        }
+    }
+
+    /**
+     * A method to return whichever word we are going to use in the excerpt for this position.
+     *
+     * @return the word with the greatest score if the score is valid, if not, return the longest word
+     */
+    public String getWordToOutput() {
+        if (smallestScoreIndex == -1 && longestWordIndex == -1) { // if we try and get the word from an object with nothing added to it (should never happen)...
+            log.warn("Trying to get token to output when none have been added: Will output \"reportmetodatawave\".");
+            return "reportmetodatawave";
+        } else {
+            if (useScores) { // if we have added at least one score, and it is a valid score...
+                return words.get(smallestScoreIndex); // return the word with the smallest score
+            } else { // default to returning the longest word if the scores don't exist/aren't valid
+                return words.get(longestWordIndex); // return the longest word
+            }
+        }
+    }
+
+    /**
+     * Returns the list of words saved in this object.
+     *
+     * @return words
+     */
+    public List<String> getWordsList() {
+        return words;
+    }
+
+    /**
+     * Sets the list of words to the one passed in and sets the list of scores to -1.
+     *
+     * @param words
+     *            a list of words
+     */
+    public void setWordsList(List<String> words) {
+        reset();
+        for (String word : words) {
+            addTerm(word);
+        }
+    }
+
+    /**
+     * Returns the list of scores saved in this object.
+     *
+     * @return scores
+     */
+    public List<Integer> getScoresList() {
+        return scores;
+    }
+
+    /**
+     * Sets the list of scores to the one passed in.
+     *
+     * @param scores
+     *            a list of scores
+     */
+    public void setScoresList(List<Integer> scores) {
+        this.scores = (ArrayList<Integer>) scores;
+    }
+
+    /**
+     * Sets the list of words and the list of scores to the ones passed in.
+     *
+     * @param words
+     *            a list of words
+     * @param scores
+     *            a list of scores
+     */
+    public void setWordsAndScoresList(List<String> words, List<Integer> scores) {
+        if (words.size() != scores.size()) {
+            throw new IllegalArgumentException("The words and scores lists must be the same size!");
+        } else {
+            reset();
+            for (int i = 0; i < words.size(); i++) {
+                addTerm(words.get(i), scores.get(i));
+            }
+        }
+    }
+
+    /**
+     * Returns a boolean that is true if there is a valid score in the scores list and false otherwise.
+     *
+     * @return useScores
+     */
+    public boolean getUseScores() {
+        return useScores;
+    }
+
+    public int getArrSize() {
+        return arrSize;
+    }
+
+    @Override
+    public String toString() {
+        return "WordsAndScores{" + "words=" + words + ", scores=" + scores + ", longestWordIndex=" + longestWordIndex + ", smallestScoreIndex="
+                        + smallestScoreIndex + ", arrSize=" + arrSize + '}';
+    }
+}