LUCENE-9476 Add getBulkPath API for the Taxonomy index #2247

Open: wants to merge 5 commits into base: master
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
@@ -7,6 +7,9 @@ http://s.apache.org/luceneversions

New Features

* LUCENE-9476: Add new getBulkPath API to DirectoryTaxonomyReader to more efficiently retrieve FacetLabels for multiple
facet ordinals at once (Gautam Worah, Mike McCandless)

* LUCENE-9322: Vector-valued fields, Lucene90 Codec (Mike Sokolov, Julie Tibshirani, Tomoko Uchida)

* LUCENE-9004: Approximate nearest vector search via NSW graphs
lucene/facet/src/java/org/apache/lucene/facet/taxonomy/directory/DirectoryTaxonomyReader.java
@@ -18,10 +18,12 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.function.IntUnaryOperator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
@@ -31,7 +33,7 @@
import org.apache.lucene.facet.taxonomy.ParallelTaxonomyArrays;
import org.apache.lucene.facet.taxonomy.TaxonomyReader;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.CorruptIndexException; // javadocs
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.LeafReader;
@@ -46,6 +48,7 @@
import org.apache.lucene.util.Accountables;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.RamUsageEstimator;

/**
@@ -320,23 +323,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
// instance recognizes. Therefore we do this check up front, before we hit
// the cache.
if (ordinal < 0 || ordinal >= indexReader.maxDoc()) {
return null;
}
int indexReaderMaxDoc = indexReader.maxDoc();
checkOrdinalBounds(ordinal, indexReaderMaxDoc);

// TODO: can we use an int-based hash impl, such as IntToObjectMap,
// wrapped as LRU?
Integer catIDInteger = Integer.valueOf(ordinal);
synchronized (categoryCache) {
FacetLabel res = categoryCache.get(catIDInteger);
if (res != null) {
return res;
}
FacetLabel ordinalPath = getPathFromCache(ordinal);
if (ordinalPath != null) {
return ordinalPath;
}

int readerIndex = ReaderUtil.subIndex(ordinal, indexReader.leaves());
LeafReader leafReader = indexReader.leaves().get(readerIndex).reader();
// TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
BinaryDocValues values = leafReader.getBinaryDocValues(Consts.FULL);

FacetLabel ret;
@@ -353,12 +349,137 @@ public FacetLabel getPath(int ordinal) throws IOException {
}

synchronized (categoryCache) {
categoryCache.put(catIDInteger, ret);
categoryCache.put(ordinal, ret);
}

return ret;
}

private FacetLabel getPathFromCache(int ordinal) {
// TODO: can we use an int-based hash impl, such as IntToObjectMap,
Member:

Oooh that is a great idea, and low-hanging fruit, and would greatly reduce the RAM usage for this cache.

I think DirectoryTaxonomyWriter also has such a cache that we could change to a native map.

Could you open a spinoff issue?

// wrapped as LRU?
synchronized (categoryCache) {
return categoryCache.get(ordinal);
}
}
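
To make the TODO above concrete, here is a rough sketch of a boxing-free, int-keyed cache. DirectMappedLabelCache is a hypothetical name, and the direct-mapped slot scheme is a simplification standing in for a true LRU, which would need a purpose-built int map like the IntToObjectMap suggested above:

final class DirectMappedLabelCache {
  private final int[] ordinals; // -1 marks an empty slot (ordinals are never negative)
  private final FacetLabel[] labels;

  DirectMappedLabelCache(int slots) {
    ordinals = new int[slots];
    java.util.Arrays.fill(ordinals, -1);
    labels = new FacetLabel[slots];
  }

  // each ordinal maps to exactly one slot; a mismatch means a miss
  synchronized FacetLabel get(int ordinal) {
    int slot = ordinal % ordinals.length;
    return ordinals[slot] == ordinal ? labels[slot] : null;
  }

  // a colliding ordinal simply evicts the previous occupant of its slot
  synchronized void put(int ordinal, FacetLabel label) {
    int slot = ordinal % ordinals.length;
    ordinals[slot] = ordinal;
    labels[slot] = label;
  }
}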

private void checkOrdinalBounds(int ordinal, int indexReaderMaxDoc)
throws IllegalArgumentException {
if (ordinal < 0 || ordinal >= indexReaderMaxDoc) {
throw new IllegalArgumentException(
"ordinal "
+ ordinal
+ " is out of the range of the indexReader "
+ indexReader.toString());
}
}

/**
* Returns an array of FacetLabels for a given array of ordinals.
*
* <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
* ordinals. It uses DocValues-based iteration when the index stores facet labels in
* BinaryDocValues, and falls back to calling {@link #getPath(int)} iteratively (with no
* performance gain) when it detects that the index was created using StoredFields.
*
* @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
* index
*/
public FacetLabel[] getBulkPath(int... ordinals) throws IOException {
ensureOpen();

int ordinalsLength = ordinals.length;
FacetLabel[] bulkPath = new FacetLabel[ordinalsLength];
// remember the original positions of ordinals before they are sorted
int[] originalPosition = new int[ordinalsLength];
Arrays.setAll(originalPosition, IntUnaryOperator.identity());
int indexReaderMaxDoc = indexReader.maxDoc();

for (int i = 0; i < ordinalsLength; i++) {
// check whether the ordinal is valid before accessing the cache
checkOrdinalBounds(ordinals[i], indexReaderMaxDoc);
// check the cache before trying to find it in the index
FacetLabel ordinalPath = getPathFromCache(ordinals[i]);
if (ordinalPath != null) {
bulkPath[i] = ordinalPath;
}
}

// parallel sort the ordinals and originalPosition array based on the values in the ordinals
// array
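// e.g. ordinals {5, 1, 3} sorts to {1, 3, 5} while originalPosition becomes {1, 2, 0},
// so bulkPath[originalPosition[i]] writes each result back to its caller-visible slot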
new InPlaceMergeSorter() {
@Override
protected void swap(int i, int j) {
int x = ordinals[i];
ordinals[i] = ordinals[j];
ordinals[j] = x;

x = originalPosition[i];
originalPosition[i] = originalPosition[j];
originalPosition[j] = x;
}

@Override
public int compare(int i, int j) {
return Integer.compare(ordinals[i], ordinals[j]);
}
}.sort(0, ordinalsLength);

int readerIndex;
int leafReaderMaxDoc = 0;
int leafReaderDocBase = 0;
LeafReader leafReader;
LeafReaderContext leafReaderContext;
BinaryDocValues values = null;

for (int i = 0; i < ordinalsLength; i++) {
if (bulkPath[originalPosition[i]] == null) {
if (values == null || ordinals[i] >= leafReaderMaxDoc) {

readerIndex = ReaderUtil.subIndex(ordinals[i], indexReader.leaves());
leafReaderContext = indexReader.leaves().get(readerIndex);
leafReader = leafReaderContext.reader();
leafReaderMaxDoc = leafReader.maxDoc();
leafReaderDocBase = leafReaderContext.docBase;
values = leafReader.getBinaryDocValues(Consts.FULL);

// this check is only needed once to confirm that the index uses BinaryDocValues
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
if (success == false) {
return getBulkPathForOlderIndexes(ordinals);
Member:

Hmm, I'm confused -- wouldn't an older index have no BinaryDocValues field? So values would be null, and we should fall back then?

This code should hit a NullPointerException on an old index, I think? How come our backwards compatibility test didn't expose this?

}
}
boolean success = values.advanceExact(ordinals[i] - leafReaderDocBase);
assert success;
bulkPath[originalPosition[i]] =
new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
}
}

for (int i = 0; i < ordinalsLength; i++) {
synchronized (categoryCache) {
categoryCache.put(ordinals[i], bulkPath[originalPosition[i]]);
Member:

We will sometimes put ordinals back into the cache that were already there at the start of this method, right? I guess that's harmless. Or maybe we should move this up above? Then we could do it only for those ordinals that were not already cached.

Contributor Author:

I think intuitively adding the ordinals back into the cache would not be a problem. This should also (theoretically) be faster than trying to get the lock again and again in a loop?

Member:

> This should also (theoretically) be faster than trying to get the lock again and again in a loop?

Hmm, I'm confused: this code is already getting the lock inside a for loop? I guess we could move the synchronized outside of the for loop? Or maybe javac is doing this for us already? But let's make it explicit, or let's just merge this for loop with the one before (and keep acquiring the lock inside the for loop). One big benefit of the latter approach is that if all of the ordinals were already cached (hopefully a common case), we do not need any locking, but with this approach, we still do.
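
A rough sketch of that merged-loop shape (a hypothetical restructuring, not part of this change), shown as the tail of the main lookup loop:

for (int i = 0; i < ordinalsLength; i++) {
  if (bulkPath[originalPosition[i]] == null) {
    // ... advance the BinaryDocValues to ordinals[i] as above, then:
    FacetLabel label =
        new FacetLabel(FacetsConfig.stringToPath(values.binaryValue().utf8ToString()));
    bulkPath[originalPosition[i]] = label;
    synchronized (categoryCache) { // the lock is only taken on a cache miss
      categoryCache.put(ordinals[i], label);
    }
  }
}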

}
}

return bulkPath;
}
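
For reference, a minimal usage sketch of the new API; the in-memory directory and the two categories are illustrative, not taken from this change:

import org.apache.lucene.facet.taxonomy.FacetLabel;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyReader;
import org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

Directory taxoDir = new ByteBuffersDirectory();
DirectoryTaxonomyWriter taxoWriter = new DirectoryTaxonomyWriter(taxoDir);
int ordA = taxoWriter.addCategory(new FacetLabel("a"));
int ordB = taxoWriter.addCategory(new FacetLabel("b"));
taxoWriter.commit();

DirectoryTaxonomyReader taxoReader = new DirectoryTaxonomyReader(taxoDir);
// one call resolves both ordinals, instead of looping over getPath(int)
FacetLabel[] labels = taxoReader.getBulkPath(ordA, ordB); // labels[0] is the path for ordA
taxoReader.close();
taxoWriter.close();
taxoDir.close();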

/**
* This method is only used when the underlying taxonomy index was built with the older (slower)
* StoredFields-based codec (pre-8.7). The {@link #getBulkPath(int...)} method calls it
* internally when it detects that the index uses StoredFields.
*/
private FacetLabel[] getBulkPathForOlderIndexes(int... ordinals) throws IOException {
FacetLabel[] bulkPath = new FacetLabel[ordinals.length];
for (int i = 0; i < ordinals.length; i++) {
bulkPath[i] = getPath(ordinals[i]);
}

return bulkPath;
}
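
Following the review comment above (an older index has no BinaryDocValues field at all, so values would be null before advanceExact is ever called), a hedged sketch of a null-based fallback check inside getBulkPath:

values = leafReader.getBinaryDocValues(Consts.FULL);
if (values == null) {
  // pre-8.7 taxonomy index: facet labels live in StoredFields, not BinaryDocValues
  return getBulkPathForOlderIndexes(ordinals);
}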

@Override
public int getSize() {
ensureOpen();
@@ -366,9 +366,9 @@ public void testReaderBasic() throws Exception {
}
}
// (also test invalid ordinals:)
assertNull(tr.getPath(-1));
assertNull(tr.getPath(tr.getSize()));
assertNull(tr.getPath(TaxonomyReader.INVALID_ORDINAL));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(-1));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(tr.getSize()));
expectThrows(IllegalArgumentException.class, () -> tr.getPath(TaxonomyReader.INVALID_ORDINAL));

// test TaxonomyReader.getOrdinal():
for (int i = 1; i < expectedCategories.length; i++) {
@@ -84,6 +84,31 @@ private void createNewTaxonomyIndex(String dirName) throws IOException {
dir.close();
}

// Opens up a pre-existing index and tries to run getBulkPath on it
public void testGetBulkPathOnOlderCodec() throws Exception {
Path indexDir = createTempDir(oldTaxonomyIndexName);
TestUtil.unzip(getDataInputStream(oldTaxonomyIndexName + ".zip"), indexDir);
Directory dir = newFSDirectory(indexDir);

DirectoryTaxonomyWriter writer = new DirectoryTaxonomyWriter(dir);

FacetLabel cp_b = new FacetLabel("b");
writer.addCategory(cp_b);
writer.getInternalIndexWriter().forceMerge(1);
writer.commit();

DirectoryTaxonomyReader reader = new DirectoryTaxonomyReader(writer);

FacetLabel[] facetLabels = {new FacetLabel("a"), cp_b};
int[] ordinals =
new int[] {reader.getOrdinal(facetLabels[0]), reader.getOrdinal(facetLabels[1])};
// The zip file already contains a category "a" stored with the older StoredFields codec
assertArrayEquals(facetLabels, reader.getBulkPath(ordinals));
reader.close();
writer.close();
dir.close();
}

// Used to create a fresh taxonomy index with StoredFields
@Ignore
public void testCreateOldTaxonomy() throws IOException {
@@ -413,7 +413,7 @@ public void testOpenIfChangedReuse() throws Exception {

// check that r1 doesn't see cp_b
assertEquals(TaxonomyReader.INVALID_ORDINAL, r1.getOrdinal(cp_b));
assertNull(r1.getPath(2));
expectThrows(IllegalArgumentException.class, () -> r1.getPath(2));

r1.close();
r2.close();
@@ -569,4 +569,39 @@ public void testAccountable() throws Exception {
taxoReader.close();
dir.close();
}

public void testCallingBulkPathReturnsCorrectResult() throws Exception {
float PROBABILITY_OF_COMMIT = 0.5f;
Directory src = newDirectory();
DirectoryTaxonomyWriter w = new DirectoryTaxonomyWriter(src);
String[] randomArray = new String[random().nextInt(1000)];
// adding a smaller bound on ints ensures that we will have some duplicate ordinals in random
// test cases
Arrays.setAll(randomArray, i -> Integer.toString(random().nextInt(500)));

FacetLabel[] allPaths = new FacetLabel[randomArray.length];
int[] allOrdinals = new int[randomArray.length];

for (int i = 0; i < randomArray.length; i++) {
allPaths[i] = new FacetLabel(randomArray[i]);
w.addCategory(allPaths[i]);
// add random commits to create multiple segments in the index
if (random().nextFloat() < PROBABILITY_OF_COMMIT) {
w.commit();
}
}
w.commit();
w.close();

DirectoryTaxonomyReader r1 = new DirectoryTaxonomyReader(src);

for (int i = 0; i < allPaths.length; i++) {
allOrdinals[i] = r1.getOrdinal(allPaths[i]);
}

FacetLabel[] allBulkPaths = r1.getBulkPath(allOrdinals);
assertArrayEquals(allPaths, allBulkPaths);
r1.close();
src.close();
}
}