Update hit term extractor to use scores when considering hit terms

documentation pt1; format; documentation pt2; format; set retry to 0 and add log for testing; wordsandscores test; testing; format; fix documentation and add new method; more tests, fix edge case, documentation; test, log, tostring; more methods, more tests; add getArrSize; pass skips on end of cq; squash me; squash me; working except for excerptTest; fix test; squash me; fix before/after
NationalSecurityAgency · Apr 9, 2024 · 216f3c5 · 216f3c5
1 parent 6730346
commit 216f3c5
Show file tree

Hide file tree

Showing 8 changed files with 824 additions and 105 deletions.
diff --git a/.../query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java b/.../query-core/src/main/java/datawave/query/iterator/logic/TermFrequencyExcerptIterator.java
@@ -5,7 +5,6 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
-import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.SortedSet;
@@ -66,6 +65,19 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
     // the top value
     protected Value tv;
 
+    // how many words skipped in the lower half of the excerpt
+    private int leftSkippedWords;
+    // how many words skipped in the upper half of the excerpt
+    private int rightSkippedWords;
+    // the list of hit terms
+    protected ArrayList<String> hitTermsList;
+    // the direction for the excerpt
+    private String direction;
+
+    private static final String BEFORE = "BEFORE";
+    private static final String AFTER = "AFTER";
+    private static final String BOTH = "BOTH";
+
     @Override
     public IteratorOptions describeOptions() {
         IteratorOptions options = new IteratorOptions(TermFrequencyExcerptIterator.class.getSimpleName(),
@@ -135,6 +147,10 @@ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> op
         this.startOffset = Integer.parseInt(options.get(START_OFFSET));
         this.endOffset = Integer.parseInt(options.get(END_OFFSET));
         this.fieldName = options.get(FIELD_NAME);
+        leftSkippedWords = 0;
+        rightSkippedWords = 0;
+        hitTermsList = new ArrayList<>();
+        direction = BOTH;
     }
 
     @Override
@@ -264,7 +280,8 @@ public void next() throws IOException {
         Text cv = top.getColumnVisibility();
         long ts = top.getTimestamp();
         Text row = top.getRow();
-        List<String>[] terms = new List[endOffset - startOffset];
+        // set the size of the array to the amount of terms we need to choose
+        WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];
 
         // while we have term frequencies for the same document
         while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
@@ -278,6 +295,8 @@ public void next() throws IOException {
                 try {
                     // parse the offsets from the value
                     TermWeight.Info info = TermWeight.Info.parseFrom(source.getTopValue().get());
+                    // check if the number of scores is equal to the number of offsets
+                    boolean useScores = info.getScoreCount() == info.getTermOffsetCount();
 
                     // for each offset, gather all the terms in our range
                     for (int i = 0; i < info.getTermOffsetCount(); i++) {
@@ -286,12 +305,20 @@ public void next() throws IOException {
                         if (offset >= startOffset && offset < endOffset) {
                             // calculate the index in our value list
                             int index = offset - startOffset;
-                            // if the value is larger than the value for this offset thus far
+                            // if the current index has no words/scores yet, initialize an object at the index
                             if (terms[index] == null) {
-                                terms[index] = new ArrayList<>();
+                                terms[index] = new WordsAndScores();
+                            }
+                            // if we are using scores, add the word and score to the object, if not then only add the word
+                            if (useScores) {
+                                if (info.getScore(i) < 0) {
+                                    terms[index].addTerm(fieldAndValue[1], hitTermsList);
+                                } else {
+                                    terms[index].addTerm(fieldAndValue[1], info.getScore(i), hitTermsList);
+                                }
+                            } else {
+                                terms[index].addTerm(fieldAndValue[1], hitTermsList);
                             }
-                            // use this value
-                            terms[index].add(fieldAndValue[1]);
                         }
                     }
                 } catch (InvalidProtocolBufferException e) {
@@ -302,9 +329,10 @@ public void next() throws IOException {
             // get the next term frequency
             source.next();
         }
-
         // generate the return key and value
-        tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
+        tk = new Key(row, new Text(dtUid),
+                        new Text(fieldName + Constants.NULL + generatePhrase(terms) + Constants.NULL + leftSkippedWords + Constants.NULL + rightSkippedWords),
+                        cv, ts);
         tv = new Value();
     }
 
@@ -315,28 +343,51 @@ public void next() throws IOException {
      *            the terms to create a phrase from
      * @return the phrase
      */
-    protected String generatePhrase(List<String>[] terms) {
-        String[] largestTerms = new String[terms.length];
+    protected String generatePhrase(WordsAndScores[] terms) {
+        // create an array with the same length as the one we just passed in
+        String[] termsToOutput = new String[terms.length];
+        boolean bef = direction.equals(BEFORE);
+        boolean aft = direction.equals(AFTER);
+        boolean lock = false;
+        int beforeIndex = -1;
+        int afterIndex = -1;
         for (int i = 0; i < terms.length; i++) {
-            largestTerms[i] = getLongestTerm(terms[i]);
+            // if there is nothing at this position, put nothing in the position in the new one
+            if (terms[i] == null) {
+                termsToOutput[i] = null;
+            } else {
+                // if the term to put in this position is in the stop list, increment the corresponding *skippedWords and put nothing in this position
+                if (terms[i].getWordToOutput() == null) {
+                    if (i <= (terms.length / 2)) {
+                        leftSkippedWords++;
+                    } else {
+                        rightSkippedWords++;
+                    }
+                    termsToOutput[i] = null;
+                } else {
+                    // if the term to output is valid, put it in the same position in the new array
+                    termsToOutput[i] = terms[i].getWordToOutput();
+                    if ((bef || aft) && (terms[i].getHasHitTerm())) {
+                        if (!lock) {
+                            afterIndex = i;
+                            lock = true;
+                        }
+                        beforeIndex = i;
+                    }
+                }
+            }
         }
-
-        return joiner.join(largestTerms);
-    }
-
-    /**
-     * Get the longest term from a list of terms;
-     *
-     * @param terms
-     *            the terms to create a phrase
-     * @return the longest term (null if empty or null list)
-     */
-    protected String getLongestTerm(List<String> terms) {
-        if (terms == null || terms.isEmpty()) {
-            return null;
-        } else {
-            return terms.stream().max(Comparator.comparingInt(String::length)).get();
+        if (bef) { // if direction is "before", set everything after the last hit term to null
+            for (int k = beforeIndex + 1; k < terms.length; k++) {
+                termsToOutput[k] = null;
+            }
+        } else if (aft) { // if direction is "after", set everything before the first hit term to null
+            for (int k = 0; k < afterIndex; k++) {
+                termsToOutput[k] = null;
+            }
         }
+        // join everything together with spaces while skipping null offsets
+        return joiner.join(termsToOutput);
     }
 
     /**
@@ -471,17 +522,18 @@ private String getDtUid(String str) {
         }
     }
 
+    public void setHitTermsList(List<String> hitTermsList) {
+        this.hitTermsList = (ArrayList<String>) hitTermsList;
+    }
+
+    public void setDirection(String direction) {
+        this.direction = direction;
+    }
+
     @Override
     public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append("TermFrequencyExcerptIterator: ");
-        sb.append(this.fieldName);
-        sb.append(", ");
-        sb.append(this.startOffset);
-        sb.append(", ");
-        sb.append(this.endOffset);
-
-        return sb.toString();
+
+        return "TermFrequencyExcerptIterator: " + this.fieldName + ", " + this.startOffset + ", " + this.endOffset;
     }
 
 }