LUCENE-9476 Add getBulkPath API for the Taxonomy index #2247

Open: wants to merge 5 commits into base: master
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -7,6 +7,9 @@ http://s.apache.org/luceneversions

New Features

* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple
facet ordinals at once (Gautam Worah, Mike McCandless)

* LUCENE-9322: Vector-valued fields, Lucene90 Codec (Mike Sokolov, Julie Tibshirani, Tomoko Uchida)

* LUCENE-9004: Approximate nearest vector search via NSW graphs
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -18,10 +18,12 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.IntUnaryOperator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
@@ -31,7 +33,7 @@
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
@@ -46,6 +48,7 @@
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;

/**
@@ -320,23 +323,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
int indexReaderMaxDoc = indexReader.maxDoc();
checkOrdinalBounds(ordinal, indexReaderMaxDoc);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel ordinalPath = getPathFromCache(ordinal);
if (ordinalPath != null) {
return ordinalPath;
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);

FacetLabel ret;
@@ -353,12 +349,137 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

private FacetLabel getPathFromCache(int ordinal) {
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
Member:

Oooh that is a great idea, and low-hanging fruit, and would greatly reduce the RAM usage for this cache.

I think DirectoryTaxonomyWriter also has such a cache that we could change to a native map.

Could you open a spinoff issue?

// wrapped as LRU?
synchronized (categoryCache) {
return categoryCache.get(ordinal);
}
}
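
To make the TODO above concrete, here is a rough sketch of a boxing-free, int-keyed cache. DirectMappedLabelCache is a hypothetical name, and the direct-mapped slot scheme is a simplification standing in for a true LRU, which would need a purpose-built int map like the IntToObjectMap suggested above:

final class DirectMappedLabelCache {
  private final int[] ordinals; // -1 marks an empty slot (ordinals are never negative)
  private final FacetLabel[] labels;

  DirectMappedLabelCache(int slots) {
    ordinals = new int[slots];
    java.util.Arrays.fill(ordinals, -1);
    labels = new FacetLabel[slots];
  }

  // each ordinal maps to exactly one slot; a mismatch means a miss
  synchronized FacetLabel get(int ordinal) {
    int slot = ordinal % ordinals.length;
    return ordinals[slot] == ordinal ? labels[slot] : null;
  }

  // a colliding ordinal simply evicts the previous occupant of its slot
  synchronized void put(int ordinal, FacetLabel label) {
    int slot = ordinal % ordinals.length;
    ordinals[slot] = ordinal;
    labels[slot] = label;
  }
}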

private void checkOrdinalBounds(int ordinal, int indexReaderMaxDoc)
throws IllegalArgumentException {
if (ordinal < 0 || ordinal >= indexReaderMaxDoc) {
throw new IllegalArgumentException(
"ordinal "
+ ordinal
+ " is out of the range of the indexReader "
+ indexReader.toString());
}
}

/**
* Returns an array of FacetLabels for a given array of ordinals.
*
* <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
* ordinals. It uses DocValues-based iteration when the index stores facet labels in
* BinaryDocValues, and falls back to calling {@link #getPath(int)} iteratively (with no
* performance gain) when it detects that the index was created using StoredFields.
*
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
* index
*/
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
ensureOpen();

int ordinalsLength = ordinals.length;
FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
// remember the original positions of ordinals before they are sorted
int[] originalPosition = new int[ordinalsLength];
Arrays.setAll(originalPosition, IntUnaryOperator.identity());
int indexReaderMaxDoc = indexReader.maxDoc();

for (int i = 0; i < ordinalsLength; i++) {
// check whether the ordinal is valid before accessing the cache
checkOrdinalBounds(ordinals[i], indexReaderMaxDoc);
// check the cache before trying to find it in the index
FacetLabel ordinalPath = getPathFromCache(ordinals[i]);
if (ordinalPath != null) {
bulkPath[i] = ordinalPath;
}
}

// parallel sort the ordinals and originalPosition array based on the values in the ordinals
// array
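// e.g. ordinals {5, 1, 3} sorts to {1, 3, 5} while originalPosition becomes {1, 2, 0},
// so bulkPath[originalPosition[i]] writes each result back to its caller-visible slot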
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int x = ordinals[i];
ordinals[i] = ordinals[j];
ordinals[j] = x;

x = originalPosition[i];
originalPosition[i] = originalPosition[j];
originalPosition[j] = x;
}

@Override
public int compare(int i, int j) {
return Integer.compare(ordinals[i], ordinals[j]);
}
}.sort(0, ordinalsLength);

int readerIndex;
int leafReaderMaxDoc = 0;
int leafReaderDocBase = 0;
LeafReader leafReader;
LeafReaderContext leafReaderContext;
BinaryDocValues values = null;

for (int i = 0; i < ordinalsLength; i++) {
if (bulkPath[originalPosition[i]] == null) {
if (values == null || ordinals[i] >= leafReaderMaxDoc) {

readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves());
leafReaderContext = indexReader.leaves().get(readerIndex);
leafReader = leafReaderContext.reader();
leafReaderMaxDoc = leafReader.maxDoc();
leafReaderDocBase = leafReaderContext.docBase;
values = leafReader.getBinaryDocValues(Consts.FULL);

// this check is only needed once to confirm that the index uses BinaryDocValues
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
if (success == false) {
return getBulkPathForOlderIndexes(ordinals);
Member:

Hmm, I'm confused -- wouldn't an older index have no BinaryDocValues field? So values would be null, and we should fall back then?

This code should hit a NullPointerException on an old index, I think? How come our backwards compatibility test didn't expose this?

}
}
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
assert success;
bulkPath[originalPosition[i]] =
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
}
}

for (int i = 0; i < ordinalsLength; i++) {
synchronized (categoryCache) {
categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]);
Member:

We will sometimes put ordinals back into the cache that were already there at the start of this method, right? I guess that's harmless. Or maybe we should move this up above? Then we could do it only for those ordinals that were not already cached.

Contributor Author:

I think intuitively adding the ordinals back into the cache would not be a problem. This should also (theoretically) be faster than trying to get the lock again and again in a loop?

Member:

> This should also (theoretically) be faster than trying to get the lock again and again in a loop?

Hmm, I'm confused: this code is already getting the lock inside a for loop? I guess we could move the synchronized outside of the for loop? Or maybe javac is doing this for us already? But let's make it explicit, or let's just merge this for loop with the one before (and keep acquiring the lock inside the for loop). One big benefit of the latter approach is that if all of the ordinals were already cached (hopefully a common case), we do not need any locking, but with this approach, we still do.
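
A rough sketch of that merged-loop shape (a hypothetical restructuring, not part of this change), shown as the tail of the main lookup loop:

for (int i = 0; i < ordinalsLength; i++) {
  if (bulkPath[originalPosition[i]] == null) {
    // ... advance the BinaryDocValues to ordinals[i] as above, then:
    FacetLabel label =
        new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
    bulkPath[originalPosition[i]] = label;
    synchronized (categoryCache) { // the lock is only taken on a cache miss
      categoryCache.put(ordinals[i], label);
    }
  }
}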

}
}

return bulkPath;
}
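
For reference, a minimal usage sketch of the new API; the in-memory directory and the two categories are illustrative, not taken from this change:

import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

Directory taxoDir = new ByteBuffersDirectory();
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
int ordA = taxoWriter.addCategory(new FacetLabel("a"));
int ordB = taxoWriter.addCategory(new FacetLabel("b"));
taxoWriter.commit();

DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
// one call resolves both ordinals, instead of looping over getPath(int)
FacetLabel[] labels = taxoReader.getBulkPath(ordA, ordB); // labels[0] is the path for ordA
taxoReader.close();
taxoWriter.close();
taxoDir.close();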

/**
* This method is only used when the underlying taxonomy index was built with the older (slower)
* StoredFields-based codec (pre-8.7). The {@link #getBulkPath(int...)} method calls it
* internally when it detects that the index uses StoredFields.
*/
private FacetLabel[] getBulkPathForOlderIndexes(int... ordinals) throws IOException {
FacetLabel[] bulkPath = new FacetLabel[ordinals.length];
for (int i = 0; i < ordinals.length; i++) {
bulkPath[i] = getPath(ordinals[i]);
}

return bulkPath;
}
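
Following the review comment above (an older index has no BinaryDocValues field at all, so values would be null before advanceExact is ever called), a hedged sketch of a null-based fallback check inside getBulkPath:

values = leafReader.getBinaryDocValues(Consts.FULL);
if (values == null) {
  // pre-8.7 taxonomy index: facet labels live in StoredFields, not BinaryDocValues
  return getBulkPathForOlderIndexes(ordinals);
}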

@Override
public int getSize() {
ensureOpen();
@@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
@@ -84,6 +84,31 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Opens up a pre-existing index and tries to run getBulkPath on it
public void testGetBulkPathOnOlderCodec() throws Exception {
Path indexDir = createTempDir(oldTaxonomyIndexName);
TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
Directory dir = newFSDirectory(indexDir);

DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);

FacetLabel cp_b = new FacetLabel("b");
writer.addCategory(cp_b);
writer.getInternalIndexWriter().forceMerge(1);
writer.commit();

DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer);

FacetLabel[] facetLabels = {new FacetLabel("a"), cp_b};
int[] ordinals =
new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])};
// The zip file already contains a category "a" stored with the older StoredFields codec
assertArrayEquals(facetLabels, reader.getBulkPath(ordinals));
reader.close();
writer.close();
dir.close();
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
@@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception {

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
expectThrows(IllegalArgumentException.class, () -> r1.getPath(2));

r1.close();
r2.close();
@@ -569,4 +569,39 @@ public void testAccountable() throws Exception {
taxoReader.close();
dir.close();
}

public void testCallingBulkPathReturnsCorrectResult() throws Exception {
float PROBABILITY_OF_COMMIT = 0.5f;
Directory src = newDirectory();
DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);
String[] randomArray = new String[random().nextInt(1000)];
// adding a smaller bound on ints ensures that we will have some duplicate ordinals in random
// test cases
Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt(500)));

FacetLabel[] allPaths = new FacetLabel[randomArray.length];
int[] allOrdinals = new int[randomArray.length];

for (int i = 0; i < randomArray.length; i++) {
allPaths[i] = new FacetLabel(randomArray[i]);
w.addCategory(allPaths[i]);
// add random commits to create multiple segments in the index
if (random().nextFloat() < PROBABILITY_OF_COMMIT) {
w.commit();
}
}
w.commit();
w.close();

DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src);

for (int i = 0; i < allPaths.length; i++) {
allOrdinals[i] = r1.getOrdinal(allPaths[i]);
}

FacetLabel[] allBulkPaths = r1.getBulkPath(allOrdinals);
assertArrayEquals(allPaths, allBulkPaths);
r1.close();
src.close();
}
}