Skip to content

Commit

Permalink
Update hit term extractor to use scores when considering hit terms
Browse files Browse the repository at this point in the history
documentation pt1

format

documentation pt2

format

set retry to 0 and add log for testing

wordsandscores test

testing

format

fix documentation and add new method

more tests, fix edge case, documentation

test, log, tostring

more methods, more tests

add getArrSize

pass skips on end of cq

squash me

squash me

working except for excerptTest

fix test

squash me

fix before/after

method not cq

upgrade retry

change start offset logic

quicker fail and retry

make scores output more user readable
  • Loading branch information
austin007008 committed May 8, 2024
1 parent 1869c2a commit f25999b
Show file tree
Hide file tree
Showing 8 changed files with 977 additions and 95 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
Expand Down Expand Up @@ -66,6 +65,18 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
// the top value
protected Value tv;

private float origHalfSize;
// the list of hit terms
protected ArrayList<String> hitTermsList;
// the direction for the excerpt
private String direction;

private boolean trim;

private static final String BEFORE = "BEFORE";
private static final String AFTER = "AFTER";
private static final String BOTH = "BOTH";

@Override
public IteratorOptions describeOptions() {
IteratorOptions options = new IteratorOptions(TermFrequencyExcerptIterator.class.getSimpleName(),
Expand Down Expand Up @@ -135,6 +146,10 @@ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> op
this.startOffset = Integer.parseInt(options.get(START_OFFSET));
this.endOffset = Integer.parseInt(options.get(END_OFFSET));
this.fieldName = options.get(FIELD_NAME);
hitTermsList = new ArrayList<>();
direction = BOTH;
origHalfSize = 0;
trim = false;
}

@Override
Expand Down Expand Up @@ -264,7 +279,9 @@ public void next() throws IOException {
Text cv = top.getColumnVisibility();
long ts = top.getTimestamp();
Text row = top.getRow();
List<String>[] terms = new List[endOffset - startOffset];
// set the size of the array to the amount of terms we need to choose
WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];
boolean stopFound;

// while we have term frequencies for the same document
while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
Expand All @@ -278,6 +295,8 @@ public void next() throws IOException {
try {
// parse the offsets from the value
TermWeight.Info info = TermWeight.Info.parseFrom(source.getTopValue().get());
// check if the number of scores is equal to the number of offsets
boolean useScores = info.getScoreCount() == info.getTermOffsetCount();

// for each offset, gather all the terms in our range
for (int i = 0; i < info.getTermOffsetCount(); i++) {
Expand All @@ -286,12 +305,25 @@ public void next() throws IOException {
if (offset >= startOffset && offset < endOffset) {
// calculate the index in our value list
int index = offset - startOffset;
// if the value is larger than the value for this offset thus far
// if the current index has no words/scores yet, initialize an object at the index
if (terms[index] == null) {
terms[index] = new ArrayList<>();
terms[index] = new WordsAndScores();
}
// if we are using scores, add the word and score to the object, if not then only add the word
if (useScores) {
if (info.getScore(i) < 0) {
stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
} else {
stopFound = terms[index].addTerm(fieldAndValue[1], info.getScore(i), hitTermsList);
}
} else {
stopFound = terms[index].addTerm(fieldAndValue[1], hitTermsList);
}
if (stopFound && !trim) {
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + "XXXWESKIPPEDAWORDXXX"), cv, ts);
tv = new Value();
return;
}
// use this value
terms[index].add(fieldAndValue[1]);
}
}
} catch (InvalidProtocolBufferException e) {
Expand All @@ -302,7 +334,6 @@ public void next() throws IOException {
// get the next term frequency
source.next();
}

// generate the return key and value
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
tv = new Value();
Expand All @@ -315,28 +346,132 @@ public void next() throws IOException {
* the terms to create a phrase from
* @return the phrase
*/
protected String generatePhrase(List<String>[] terms) {
String[] largestTerms = new String[terms.length];
protected String generatePhrase(WordsAndScores[] terms) {
// create an array with the same length as the one we just passed in
String[] termsToOutput = new String[terms.length];
boolean bef = direction.equals(BEFORE);
boolean aft = direction.equals(AFTER);
boolean lock = false;
int beforeIndex = -1;
int afterIndex = -1;
for (int i = 0; i < terms.length; i++) {
largestTerms[i] = getLongestTerm(terms[i]);
// if there is nothing at this position, put nothing in the position in the new one
if (terms[i] == null) {
termsToOutput[i] = null;
} else {
String outputWord = terms[i].getWordToOutput();
// if the term to put in this position is in the stop list, put nothing in its place
if (outputWord == null) {
termsToOutput[i] = null;
} else {
// if the term to output is valid, put it in the same position in the new array
termsToOutput[i] = outputWord;
if ((bef || aft) && (terms[i].getHasHitTerm())) {
if (!lock) {
afterIndex = i;
lock = true;
}
beforeIndex = i;
}
}
}
}
if (!lock) {
if (!trim) {
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else {
// join everything together with spaces while skipping null offsets (after trimming both sides down to what we need)
return joiner.join(bothTrim(termsToOutput));
}
} else {
if (bef) { // if direction is "before", set everything after the last hit term to null
for (int k = beforeIndex + 1; k < terms.length; k++) {
termsToOutput[k] = null;
}
if (trim) {
float start = beforeIndex - (origHalfSize * 2);
boolean startNull = false;
for (int k = beforeIndex; k >= 0; k--) {
if (!startNull && k < start) {
startNull = true;
}
if (startNull) {
termsToOutput[k] = null;
} else {
if (termsToOutput[k] == null) {
start--;
}
}
}
}
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else if (aft) { // if direction is "after", set everything before the first hit term to null
for (int k = 0; k < afterIndex; k++) {
termsToOutput[k] = null;
}
if (trim) {
float start = afterIndex + (origHalfSize * 2);
boolean startNull = false;
for (int k = afterIndex; k < terms.length; k++) {
if (!startNull && k > start) {
startNull = true;
}
if (startNull) {
termsToOutput[k] = null;
} else {
if (termsToOutput[k] == null) {
start++;
}
}
}
}
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else {
if (!trim) {
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
} else {
// join everything together with spaces while skipping null offsets (after trimming both sides down to what we need)
return joiner.join(bothTrim(termsToOutput));
}
}
}

return joiner.join(largestTerms);
}

/**
* Get the longest term from a list of terms.
*
* @param terms
* the terms to create a phrase
* @return the longest term (null if empty or null list)
*/
protected String getLongestTerm(List<String> terms) {
if (terms == null || terms.isEmpty()) {
return null;
} else {
return terms.stream().max(Comparator.comparingInt(String::length)).get();
/**
 * Trims the output array on both sides of the midpoint of the expanded offset range so that
 * roughly {@code origHalfSize} non-null words are kept on each side. Null entries do not count
 * against the budget: each null encountered inside the window widens the boundary by one
 * position in that direction.
 *
 * @param termsToOutput
 *            the chosen output word per offset (null where nothing is emitted); modified in place
 * @return the same array, with positions outside the trimmed window set to null
 */
private String[] bothTrim(String[] termsToOutput) {
    // midpoint of the requested (expanded) offset range
    int expandedMid = (endOffset - startOffset) / 2;
    // left-side boundary: origHalfSize non-null words before the midpoint
    int start = (int) (expandedMid - origHalfSize);
    boolean startNull = false;
    for (int k = expandedMid; k >= 0; k--) {
        // once past the boundary, null out everything further left
        if (!startNull && k < start) {
            startNull = true;
        }
        if (startNull) {
            termsToOutput[k] = null;
        } else {
            // a null inside the window doesn't count toward the budget: extend the boundary
            if (termsToOutput[k] == null) {
                start--;
            }
        }
    }
    // right-side boundary: origHalfSize non-null words after the midpoint
    start = (int) (expandedMid + origHalfSize);
    startNull = false;
    for (int k = expandedMid; k < termsToOutput.length; k++) {
        // once past the boundary, null out everything further right
        if (!startNull && k > start) {
            startNull = true;
        }
        if (startNull) {
            termsToOutput[k] = null;
        } else {
            // a null inside the window doesn't count toward the budget: extend the boundary
            if (termsToOutput[k] == null) {
                start++;
            }
        }
    }
    return termsToOutput;
}

/**
Expand Down Expand Up @@ -471,17 +606,26 @@ private String getDtUid(String str) {
}
}

/**
 * Sets the list of hit terms considered when choosing output words.
 * <p>
 * Copies the given list instead of downcasting it, so any {@code List} implementation is
 * accepted — the original unchecked cast threw {@code ClassCastException} for non-ArrayList
 * arguments (e.g. {@code LinkedList}, {@code List.of(...)}).
 *
 * @param hitTermsList
 *            the hit terms to match against
 */
public void setHitTermsList(List<String> hitTermsList) {
    this.hitTermsList = new ArrayList<>(hitTermsList);
}

/**
 * Sets the excerpt direction; expected values are "BEFORE", "AFTER", or "BOTH"
 * (see the BEFORE/AFTER/BOTH constants compared against in generatePhrase).
 *
 * @param direction
 *            the direction for the excerpt
 */
public void setDirection(String direction) {
    this.direction = direction;
}

/**
 * Sets the half-size of the originally requested excerpt window, used as the per-side
 * word budget when trimming output (see bothTrim and the trim branches in generatePhrase).
 *
 * @param origHalfSize
 *            half the size of the original requested excerpt
 */
public void setOrigHalfSize(float origHalfSize) {
    this.origHalfSize = origHalfSize;
}

/**
 * Enables or disables trimming of the generated phrase down to the originally
 * requested window size.
 *
 * @param trim
 *            whether to trim the excerpt
 */
public void setTrim(boolean trim) {
    this.trim = trim;
}

/**
 * Returns a short description of this iterator: the field name followed by the
 * start and end offsets of the requested range.
 *
 * @return "TermFrequencyExcerptIterator: &lt;field&gt;, &lt;start&gt;, &lt;end&gt;"
 */
@Override
public String toString() {
    return "TermFrequencyExcerptIterator: " + this.fieldName + ", " + this.startOffset + ", " + this.endOffset;
}

}

0 comments on commit f25999b

Please sign in to comment.