Skip to content

Commit

Permalink
Update hit term extractor to use scores when considering hit terms
Browse files Browse the repository at this point in the history
documentation pt1

format

documentation pt2

format

set retry to 0 and add log for testing

wordsandscores test

testing

format

fix documentation and add new method

more tests, fix edge case, documentation

test, log, tostring

more methods, more tests

add getArrSize

pass skips on end of cq

squash me

squash me

working except for excerptTest

fix test

squash me

fix before/after
  • Loading branch information
austin007008 committed Apr 9, 2024
1 parent 6730346 commit 216f3c5
Show file tree
Hide file tree
Showing 8 changed files with 824 additions and 105 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
Expand Down Expand Up @@ -66,6 +65,19 @@ public class TermFrequencyExcerptIterator implements SortedKeyValueIterator<Key,
// the top value
protected Value tv;

// number of positions in the lower half of the excerpt window (index <= length / 2) for which
// generatePhrase found no word to output; reset to 0 in init and appended to the column
// qualifier of the key built in next()
private int leftSkippedWords;
// number of positions in the upper half of the excerpt window (index > length / 2) for which
// generatePhrase found no word to output; reset to 0 in init and appended to the column
// qualifier of the key built in next()
private int rightSkippedWords;
// the list of hit terms; passed to WordsAndScores.addTerm for every term gathered in next().
// Starts as an empty list in init and can be replaced through setHitTermsList
protected ArrayList<String> hitTermsList;
// the direction for the excerpt, compared against BEFORE and AFTER in generatePhrase;
// init sets it to BOTH and setDirection can override it
private String direction;

// direction value: generatePhrase blanks every position after the last position holding a hit term
private static final String BEFORE = "BEFORE";
// direction value: generatePhrase blanks every position before the first position holding a hit term
private static final String AFTER = "AFTER";
// default direction value: generatePhrase does no direction-based trimming
private static final String BOTH = "BOTH";

@Override
public IteratorOptions describeOptions() {
IteratorOptions options = new IteratorOptions(TermFrequencyExcerptIterator.class.getSimpleName(),
Expand Down Expand Up @@ -135,6 +147,10 @@ public void init(SortedKeyValueIterator<Key,Value> source, Map<String,String> op
this.startOffset = Integer.parseInt(options.get(START_OFFSET));
this.endOffset = Integer.parseInt(options.get(END_OFFSET));
this.fieldName = options.get(FIELD_NAME);
leftSkippedWords = 0;
rightSkippedWords = 0;
hitTermsList = new ArrayList<>();
direction = BOTH;
}

@Override
Expand Down Expand Up @@ -264,7 +280,8 @@ public void next() throws IOException {
Text cv = top.getColumnVisibility();
long ts = top.getTimestamp();
Text row = top.getRow();
List<String>[] terms = new List[endOffset - startOffset];
// set the size of the array to the amount of terms we need to choose
WordsAndScores[] terms = new WordsAndScores[endOffset - startOffset];

// while we have term frequencies for the same document
while (source.hasTop() && dtUid.equals(getDtUidFromTfKey(source.getTopKey()))) {
Expand All @@ -278,6 +295,8 @@ public void next() throws IOException {
try {
// parse the offsets from the value
TermWeight.Info info = TermWeight.Info.parseFrom(source.getTopValue().get());
// check if the number of scores is equal to the number of offsets
boolean useScores = info.getScoreCount() == info.getTermOffsetCount();

// for each offset, gather all the terms in our range
for (int i = 0; i < info.getTermOffsetCount(); i++) {
Expand All @@ -286,12 +305,20 @@ public void next() throws IOException {
if (offset >= startOffset && offset < endOffset) {
// calculate the index in our value list
int index = offset - startOffset;
// if the value is larger than the value for this offset thus far
// if the current index has no words/scores yet, initialize an object at the index
if (terms[index] == null) {
terms[index] = new ArrayList<>();
terms[index] = new WordsAndScores();
}
// if we are using scores, add the word and score to the object, if not then only add the word
if (useScores) {
if (info.getScore(i) < 0) {
terms[index].addTerm(fieldAndValue[1], hitTermsList);
} else {
terms[index].addTerm(fieldAndValue[1], info.getScore(i), hitTermsList);
}
} else {
terms[index].addTerm(fieldAndValue[1], hitTermsList);
}
// use this value
terms[index].add(fieldAndValue[1]);
}
}
} catch (InvalidProtocolBufferException e) {
Expand All @@ -302,9 +329,10 @@ public void next() throws IOException {
// get the next term frequency
source.next();
}

// generate the return key and value
tk = new Key(row, new Text(dtUid), new Text(fieldName + Constants.NULL + generatePhrase(terms)), cv, ts);
tk = new Key(row, new Text(dtUid),
new Text(fieldName + Constants.NULL + generatePhrase(terms) + Constants.NULL + leftSkippedWords + Constants.NULL + rightSkippedWords),
cv, ts);
tv = new Value();
}

Expand All @@ -315,28 +343,51 @@ public void next() throws IOException {
* the terms to create a phrase from
* @return the phrase
*/
protected String generatePhrase(List<String>[] terms) {
String[] largestTerms = new String[terms.length];
protected String generatePhrase(WordsAndScores[] terms) {
// create an array with the same length as the one we just passed in
String[] termsToOutput = new String[terms.length];
boolean bef = direction.equals(BEFORE);
boolean aft = direction.equals(AFTER);
boolean lock = false;
int beforeIndex = -1;
int afterIndex = -1;
for (int i = 0; i < terms.length; i++) {
largestTerms[i] = getLongestTerm(terms[i]);
// if there is nothing at this position, put nothing in the position in the new one
if (terms[i] == null) {
termsToOutput[i] = null;
} else {
// if the term to put in this position is in the stop list, increment the corresponding *skippedWords and put nothing in this position
if (terms[i].getWordToOutput() == null) {
if (i <= (terms.length / 2)) {
leftSkippedWords++;
} else {
rightSkippedWords++;
}
termsToOutput[i] = null;
} else {
// if the term to output is valid, put it in the same position in the new array
termsToOutput[i] = terms[i].getWordToOutput();
if ((bef || aft) && (terms[i].getHasHitTerm())) {
if (!lock) {
afterIndex = i;
lock = true;
}
beforeIndex = i;
}
}
}
}

return joiner.join(largestTerms);
}

/**
* Get the longest term from a list of terms;
*
* @param terms
* the terms to create a phrase
* @return the longest term (null if empty or null list)
*/
protected String getLongestTerm(List<String> terms) {
if (terms == null || terms.isEmpty()) {
return null;
} else {
return terms.stream().max(Comparator.comparingInt(String::length)).get();
if (bef) { // if direction is "before", set everything after the last hit term to null
for (int k = beforeIndex + 1; k < terms.length; k++) {
termsToOutput[k] = null;
}
} else if (aft) { // if direction is "after", set everything before the first hit term to null
for (int k = 0; k < afterIndex; k++) {
termsToOutput[k] = null;
}
}
// join everything together with spaces while skipping null offsets
return joiner.join(termsToOutput);
}

/**
Expand Down Expand Up @@ -471,17 +522,18 @@ private String getDtUid(String str) {
}
}

/**
 * Set the list of hit terms that is handed to each WordsAndScores object while terms are gathered.
 *
 * @param hitTermsList
 *            the hit terms; an ArrayList (or null) is stored as-is, any other List implementation is copied into a new ArrayList so that the call no
 *            longer fails with a ClassCastException
 */
public void setHitTermsList(List<String> hitTermsList) {
    if (hitTermsList == null || hitTermsList instanceof ArrayList) {
        // same reference as before for the types the cast always accepted
        this.hitTermsList = (ArrayList<String>) hitTermsList;
    } else {
        // e.g. List.of(...), Arrays.asList(...), LinkedList: copy instead of casting
        this.hitTermsList = new ArrayList<>(hitTermsList);
    }
}

/**
 * Set the direction for the excerpt.
 *
 * @param direction
 *            the direction, compared against BEFORE and AFTER when the phrase is generated; null falls back to BOTH (the value init assigns) so that the
 *            later comparison cannot throw a NullPointerException
 */
public void setDirection(String direction) {
    this.direction = (direction == null) ? BOTH : direction;
}

/**
 * Describe this iterator by its field name and offset range.
 *
 * @return "TermFrequencyExcerptIterator: " followed by the field name, start offset and end offset separated by ", "
 */
@Override
public String toString() {
    // single return: the earlier StringBuilder variant produced the same text and left a second, unreachable return behind
    return "TermFrequencyExcerptIterator: " + this.fieldName + ", " + this.startOffset + ", " + this.endOffset;
}

}

0 comments on commit 216f3c5

Please sign in to comment.