Skip to content

Commit

Permalink
Update hit term extractor to use scores when considering hit terms
Browse files Browse the repository at this point in the history
documentation pt1

format

documentation pt2

format

set retry to 0 and add log for testing

wordsandscores test

testing

format

fix documentation and add new method

more tests, fix edge case, documentation

test, log, tostring

more methods, more tests

add getArrSize

pass skips on end of cq

squash me

squash me

working except for excerptTest

fix test

squash me

fix before/after

method not cq

upgrade retry

change start offset logic

quicker fail and retry

make scores output more user readable
  • Loading branch information
austin007008 committed May 8, 2024
1 parent 1869c2a commit f25999b
Show file tree
Hide file tree
Showing 8 changed files with 977 additions and 95 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
Expand Down Expand Up @@ -66,6 +65,18 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
// the top value
protected Value tv;

private float origHalfSize;
// the list of hit terms
protected ArrayList<String> hitTermsList;
// the direction for the excerpt
private String direction;

private boolean trim;

private static final String BEFORE = "BEFORE";
private static final String AFTER = "AFTER";
private static final String BOTH = "BOTH";

@Override
public IteratorOptions describeOptions() {
IteratorOptions options = new IteratorOptions(TermFrequencyExcerptIterator.class.getSimpleName(),
Expand Down Expand Up @@ -135,6 +146,10 @@ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> op
this.startOffset = Integer.parseInt(options.get(START_OFFSET));
this.endOffset = Integer.parseInt(options.get(END_OFFSET));
this.fieldName = options.get(FIELD_NAME);
hitTermsList = new ArrayList<>();
direction = BOTH;
origHalfSize = 0;
trim = false;
}

@Override
Expand Down Expand Up @@ -264,7 +279,9 @@ public void next() throws IOException {
Text cv = top.getColumnVisibility();
long ts = top.getTimestamp();
Text row = top.getRow();
List<String>[] terms = new List[endOffset - startOffset];
// set the size of the array to the amount of terms we need to choose
WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];
boolean stopFound;

// while we have term frequencies for the same document
while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
Expand All @@ -278,6 +295,8 @@ public void next() throws IOException {
try {
// parse the offsets from the value
TermWeight.Info info = TermWeight.Info.parseFrom(source.getTopValue().get());
// check if the number of scores is equal to the number of offsets
boolean useScores = info.getScoreCount() == info.getTermOffsetCount();

// for each offset, gather all the terms in our range
for (int i = 0; i < info.getTermOffsetCount(); i++) {
Expand All @@ -286,12 +305,25 @@ public void next() throws IOException {
if (offset >= startOffset && offset < endOffset) {
// calculate the index in our value list
int index = offset - startOffset;
// if the value is larger than the value for this offset thus far
// if the current index has no words/scores yet, initialize an object at the index
if (terms[index] == null) {
terms[index] = new ArrayList<>();
terms[index] = new WordsAndScores();
}
// if we are using scores, add the word and score to the object, if not then only add the word
if (useScores) {
if (info.getScore(i) < 0) {
stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
} else {
stopFound = terms[index].addTerm(fieldAndValue[1], info.getScore(i), hitTermsList);
}
} else {
stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
}
if (stopFound && !trim) {
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + "XXXWESKIPPEDAWORDXXX"), cv, ts);
tv = new Value();
return;
}
// use this value
terms[index].add(fieldAndValue[1]);
}
}
} catch (InvalidProtocolBufferException e) {
Expand All @@ -302,7 +334,6 @@ public void next() throws IOException {
// get the next term frequency
source.next();
}

// generate the return key and value
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
tv = new Value();
Expand All @@ -315,28 +346,132 @@ public void next() throws IOException {
* the terms to create a phrase from
* @return the phrase
*/
protected String generatePhrase(List<String>[] terms) {
String[] largestTerms = new String[terms.length];
protected String generatePhrase(WordsAndScores[] terms) {
// create an array with the same length as the one we just passed in
String[] termsToOutput = new String[terms.length];
boolean bef = direction.equals(BEFORE);
boolean aft = direction.equals(AFTER);
boolean lock = false;
int beforeIndex = -1;
int afterIndex = -1;
for (int i = 0; i < terms.length; i++) {
largestTerms[i] = getLongestTerm(terms[i]);
// if there is nothing at this position, put nothing in the position in the new one
if (terms[i] == null) {
termsToOutput[i] = null;
} else {
String outputWord = terms[i].getWordToOutput();
// if the term to put in this position is in the stop list, put nothing in its place
if (outputWord == null) {
termsToOutput[i] = null;
} else {
// if the term to output is valid, put it in the same position in the new array
termsToOutput[i] = outputWord;
if ((bef || aft) && (terms[i].getHasHitTerm())) {
if (!lock) {
afterIndex = i;
lock = true;
}
beforeIndex = i;
}
}
}
}
if (!lock) {
if (!trim) {
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else {
// join everything together with spaces while skipping null offsets (after trimming both sides down to what we need)
return joiner.join(bothTrim(termsToOutput));
}
} else {
if (bef) { // if direction is "before", set everything after the last hit term to null
for (int k = beforeIndex + 1; k < terms.length; k++) {
termsToOutput[k] = null;
}
if (trim) {
float start = beforeIndex - (origHalfSize * 2);
boolean startNull = false;
for (int k = beforeIndex; k >= 0; k--) {
if (!startNull && k < start) {
startNull = true;
}
if (startNull) {
termsToOutput[k] = null;
} else {
if (termsToOutput[k] == null) {
start--;
}
}
}
}
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else if (aft) { // if direction is "after", set everything before the first hit term to null
for (int k = 0; k < afterIndex; k++) {
termsToOutput[k] = null;
}
if (trim) {
float start = afterIndex + (origHalfSize * 2);
boolean startNull = false;
for (int k = afterIndex; k < terms.length; k++) {
if (!startNull && k > start) {
startNull = true;
}
if (startNull) {
termsToOutput[k] = null;
} else {
if (termsToOutput[k] == null) {
start++;
}
}
}
}
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else {
if (!trim) {
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else {
// join everything together with spaces while skipping null offsets (after trimming both sides down to what we need)
return joiner.join(bothTrim(termsToOutput));
}
}
}

return joiner.join(largestTerms);
}

/**
* Get the longest term from a list of terms.
*
* @param terms
* the terms to create a phrase
* @return the longest term (null if empty or null list)
*/
protected String getLongestTerm(List<String> terms) {
if (terms == null || terms.isEmpty()) {
return null;
} else {
return terms.stream().max(Comparator.comparingInt(String::length)).get();
/**
 * Trims the output array on both sides of the midpoint of the expanded offset range so that
 * roughly {@code origHalfSize} non-null words are kept on each side. Null entries do not count
 * against the budget: each null encountered inside the window widens the boundary by one
 * position in that direction.
 *
 * @param termsToOutput
 *            the chosen output word per offset (null where nothing is emitted); modified in place
 * @return the same array, with positions outside the trimmed window set to null
 */
private String[] bothTrim(String[] termsToOutput) {
    // midpoint of the requested (expanded) offset range
    int expandedMid = (endOffset - startOffset) / 2;
    // left-side boundary: origHalfSize non-null words before the midpoint
    int start = (int) (expandedMid - origHalfSize);
    boolean startNull = false;
    for (int k = expandedMid; k >= 0; k--) {
        // once past the boundary, null out everything further left
        if (!startNull && k < start) {
            startNull = true;
        }
        if (startNull) {
            termsToOutput[k] = null;
        } else {
            // a null inside the window doesn't count toward the budget: extend the boundary
            if (termsToOutput[k] == null) {
                start--;
            }
        }
    }
    // right-side boundary: origHalfSize non-null words after the midpoint
    start = (int) (expandedMid + origHalfSize);
    startNull = false;
    for (int k = expandedMid; k < termsToOutput.length; k++) {
        // once past the boundary, null out everything further right
        if (!startNull && k > start) {
            startNull = true;
        }
        if (startNull) {
            termsToOutput[k] = null;
        } else {
            // a null inside the window doesn't count toward the budget: extend the boundary
            if (termsToOutput[k] == null) {
                start++;
            }
        }
    }
    return termsToOutput;
}

/**
Expand Down Expand Up @@ -471,17 +606,26 @@ private String getDtUid(String str) {
}
}

/**
 * Sets the list of hit terms considered when choosing output words.
 * <p>
 * Copies the given list instead of downcasting it, so any {@code List} implementation is
 * accepted — the original unchecked cast threw {@code ClassCastException} for non-ArrayList
 * arguments (e.g. {@code LinkedList}, {@code List.of(...)}).
 *
 * @param hitTermsList
 *            the hit terms to match against
 */
public void setHitTermsList(List<String> hitTermsList) {
    this.hitTermsList = new ArrayList<>(hitTermsList);
}

/**
 * Sets the excerpt direction; expected values are "BEFORE", "AFTER", or "BOTH"
 * (see the BEFORE/AFTER/BOTH constants compared against in generatePhrase).
 *
 * @param direction
 *            the direction for the excerpt
 */
public void setDirection(String direction) {
    this.direction = direction;
}

/**
 * Sets the half-size of the originally requested excerpt window, used as the per-side
 * word budget when trimming output (see bothTrim and the trim branches in generatePhrase).
 *
 * @param origHalfSize
 *            half the size of the original requested excerpt
 */
public void setOrigHalfSize(float origHalfSize) {
    this.origHalfSize = origHalfSize;
}

/**
 * Enables or disables trimming of the generated phrase down to the originally
 * requested window size.
 *
 * @param trim
 *            whether to trim the excerpt
 */
public void setTrim(boolean trim) {
    this.trim = trim;
}

/**
 * Returns a short description of this iterator: the field name followed by the
 * start and end offsets of the requested range.
 *
 * @return "TermFrequencyExcerptIterator: &lt;field&gt;, &lt;start&gt;, &lt;end&gt;"
 */
@Override
public String toString() {
    return "TermFrequencyExcerptIterator: " + this.fieldName + ", " + this.startOffset + ", " + this.endOffset;
}

}

0 comments on commit f25999b

Please sign in to comment.