Backport SOLR-14765 to branch_8_11 #2682

Open · wants to merge 5 commits into base: branch_8_11
5 changes: 5 additions & 0 deletions solr/CHANGES.txt
@@ -42,12 +42,17 @@ Bug Fixes
* SOLR-17098: ZK Credentials and ACLs are no longer sent to all ZK Servers when using Streaming Expressions.
They will only be used when sent to the default ZK Host. (Houston Putman, Jan Høydahl, David Smiley, Gus Heck, Qing Xu)

* SOLR-16585: Fixed NPE when paginating MatchAllDocs with non-zero start offset, like q=*:*&start=10 (Michael Gibney)

Optimizations
---------------------
* SOLR-16555: SolrIndexSearcher - FilterCache intersections/andNot should not clone bitsets repeatedly (Kevin Risden, David Smiley)

* SOLR-17004: ZkStateReader waitForState should check clusterState before using watchers (Kevin Risden)

* SOLR-14765: Optimize DocList creation for sort-irrelevant cases. This issue also reworks the building and caching of liveDocs
in SolrIndexSearcher, and refines/clarifies the effect of `useFilterForSortedQuery` (Michael Gibney, Mike Drob, David Smiley)

Other Changes
---------------------
* SOLR-16141: Upgrade Apache Tika to 1.28.4 (Kevin Risden)
84 changes: 60 additions & 24 deletions solr/core/src/java/org/apache/solr/search/DocSetUtil.java
@@ -31,12 +31,12 @@
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;
-import org.apache.solr.common.SolrException;

/** @lucene.experimental */
public class DocSetUtil {
@@ -76,16 +76,10 @@ public static boolean equals(DocSet a, DocSet b) {
* @lucene.experimental
*/
public static DocSet getDocSet(DocSetCollector collector, SolrIndexSearcher searcher) {
-if (collector.size() == searcher.numDocs()) {
-  if (!searcher.isLiveDocsInstantiated()) {
-    searcher.setLiveDocs( collector.getDocSet() );
-  }
-  try {
-    return searcher.getLiveDocSet();
-  } catch (IOException e) {
-    // should be impossible... liveDocs should exist, so no IO should be necessary
-    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-  }
+final int size = collector.size();
+if (size == searcher.numDocs()) {
+  // see comment under similar block in `getDocSet(DocSet, SolrIndexSearcher)`
+  return searcher.offerLiveDocs(collector::getDocSet, size);
}

return collector.getDocSet();
@@ -97,18 +91,15 @@ public static DocSet getDocSet(DocSetCollector collector, SolrIndexSearcher sear
* @lucene.experimental
*/
public static DocSet getDocSet(DocSet docs, SolrIndexSearcher searcher) {
-if (docs.size() == searcher.numDocs()) {
-  if (!searcher.isLiveDocsInstantiated()) {
-    searcher.setLiveDocs( docs );
-  }
-  try {
-    // if this docset has the same cardinality as liveDocs, return liveDocs instead
-    // so this set will be short lived garbage.
-    return searcher.getLiveDocSet();
-  } catch (IOException e) {
-    // should be impossible... liveDocs should exist, so no IO should be necessary
-    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
-  }
+final int size = docs.size();
+if (size == searcher.numDocs()) {
+  // if this docset has the same cardinality as liveDocs, return liveDocs instead
+  // so this set will be short lived garbage.
+  // This could be a very broad query, or some unusual way of running MatchAllDocsQuery?
+  // In any event, we already _have_ a viable `liveDocs` DocSet, so offer it to the
+  // SolrIndexSearcher, and use whatever canonical `liveDocs` instance the searcher returns
+  // (which may or may not be derived from the set that we offered)
+  return searcher.offerLiveDocs(() -> docs, size);
}

return docs;
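The `offerLiveDocs` call used in both blocks above is added to `SolrIndexSearcher` elsewhere in this PR and does not appear in this excerpt. A minimal sketch of the intended contract, written as hypothetical members of `SolrIndexSearcher` with illustrative names (the PR's actual field names, synchronization, and return type may differ): the searcher keeps at most one canonical live-docs `DocSet`; if one is already cached it wins, otherwise the offered set is adopted.

```java
import java.util.function.Supplier;

// Hypothetical sketch only; not the code added by this PR.
private volatile DocSet cachedLiveDocs; // lazily populated canonical live-docs set

public DocSet offerLiveDocs(Supplier<DocSet> docSetSupplier, int suppliedSize) {
  DocSet cached = cachedLiveDocs;
  if (cached != null) {
    return cached;                       // a canonical instance already exists; ignore the offer
  }
  DocSet offered = docSetSupplier.get(); // the caller has already paid the cost of computing this
  assert offered.size() == suppliedSize;
  cachedLiveDocs = offered;              // adopt the offered set as the canonical instance
  return offered;
}
```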
@@ -129,6 +120,10 @@ public static DocSet createDocSet(SolrIndexSearcher searcher, Query query, DocSe
DocSet set = ((DocSetProducer) query).createDocSet(searcher);
// assert equals(set, createDocSetGeneric(searcher, query));
return set;
} else if (query instanceof MatchAllDocsQuery) {
DocSet set = searcher.getLiveDocSet();
// assert equals(set, createDocSetGeneric(searcher, query));
return set;
}

return createDocSetGeneric(searcher, query);
@@ -279,4 +274,45 @@ public static void collectSortedDocSet(DocSet docs, IndexReader reader, Collecto
}
}

-}
/**
* Utility method to copy a specified range of {@link Bits} to a specified offset in a destination
* {@link FixedBitSet}. This can be useful, e.g., for translating per-segment bits ranges to
* composite DocSet bits ranges.
*
* @param src source Bits
* @param srcOffset start offset (inclusive) in source Bits
* @param srcLimit end offset (exclusive) in source Bits
* @param dest destination FixedBitSet
* @param destOffset start offset of range in destination
*/
static void copyTo(
Bits src, final int srcOffset, int srcLimit, FixedBitSet dest, int destOffset) {
/*
NOTE: `adjustedSegDocBase` +1 to compensate for the fact that `segOrd` always has to "read
ahead" by 1. Adding 1 to set `adjustedSegDocBase` once allows us to use `segOrd` as-is (with
no "pushback") for both `startIndex` and `endIndex` args to `dest.set(startIndex, endIndex)`
*/
final int adjustedSegDocBase = destOffset - srcOffset + 1;
int segOrd = srcLimit;
do {
/*
NOTE: we check deleted range before live range in the outer loop in order to not have
to explicitly guard against `dest.set(maxDoc, maxDoc)` in the event that the global max doc
is a delete (this case would trigger a bounds-checking AssertionError in
`FixedBitSet.set(int, int)`).
*/
do {
// consume deleted range
if (--segOrd < srcOffset) {
// we're currently in a "deleted" run, so just return; no need to do anything further
return;
}
} while (!src.get(segOrd));
final int limit = segOrd; // set highest ord (inclusive) of live range
while (segOrd-- > srcOffset && src.get(segOrd)) {
// consume live range
}
dest.set(adjustedSegDocBase + segOrd, adjustedSegDocBase + limit);
} while (segOrd > srcOffset);
}
}
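As a usage illustration (hypothetical helper, not code from this PR): the package-private `copyTo` above translates one segment's live-docs `Bits` into the corresponding doc-id range of a top-level `FixedBitSet`, which is roughly the job of assembling a composite liveDocs bitset over all segments. The caller below is assumed to live in the same `org.apache.solr.search` package, since `copyTo` is package-private.

```java
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.FixedBitSet;

// Hypothetical composite liveDocs builder, for illustration only.
static FixedBitSet buildCompositeLiveDocs(DirectoryReader reader) {
  FixedBitSet composite = new FixedBitSet(reader.maxDoc());
  for (LeafReaderContext ctx : reader.leaves()) {
    LeafReader leaf = ctx.reader();
    Bits segLiveDocs = leaf.getLiveDocs(); // null means the segment has no deletions
    if (segLiveDocs == null) {
      composite.set(ctx.docBase, ctx.docBase + leaf.maxDoc()); // the whole segment is live
    } else {
      // copy only the live positions of this segment into the composite bitset
      DocSetUtil.copyTo(segLiveDocs, 0, leaf.maxDoc(), composite, ctx.docBase);
    }
  }
  return composite;
}
```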
38 changes: 38 additions & 0 deletions solr/core/src/java/org/apache/solr/search/QueryUtils.java
@@ -24,6 +24,7 @@
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;

@@ -44,6 +45,36 @@ public static boolean isNegative(Query q) {
return true;
}

/**
* Recursively unwraps the specified query to determine whether it is capable of producing a score
* that varies across different documents. Returns true if this query is not capable of producing
* a varying score (i.e., it is a constant score query).
*/
public static boolean isConstantScoreQuery(Query q) {
while (true) {
if (q instanceof BoostQuery) {
q = ((BoostQuery) q).getQuery();
} else if (q instanceof WrappedQuery) {
q = ((WrappedQuery) q).getWrappedQuery();
} else if (q instanceof ConstantScoreQuery) {
return true;
} else if (q instanceof MatchAllDocsQuery) {
return true;
} else if (q instanceof MatchNoDocsQuery) {
return true;
} else if (q instanceof BooleanQuery) {
// NOTE: this check can be very simple because:
// 1. there's no need to check `q == clause.getQuery()` because BooleanQuery is final, with
// a builder that prevents direct loops.
// 2. we don't bother recursing to second-guess a nominally "scoring" clause that actually
// wraps a constant-score query.
return ((BooleanQuery) q).clauses().stream().noneMatch(BooleanClause::isScoring);
} else {
return false;
}
}
}
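For illustration (hypothetical field and term values; not test code from this PR), the new check unwraps boost and wrapper layers and, for `BooleanQuery`, looks only at whether any clause is scoring (i.e., has `Occur.MUST` or `Occur.SHOULD`):

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.solr.search.QueryUtils;

class IsConstantScoreQueryExamples {
  static void examples() {
    // boost and wrapper layers are unwrapped before the check
    Query wrapped = new BoostQuery(new ConstantScoreQuery(new TermQuery(new Term("f", "x"))), 2.0f);
    assert QueryUtils.isConstantScoreQuery(wrapped);

    // a BooleanQuery whose clauses are all FILTER/MUST_NOT has no scoring clause
    BooleanQuery filtersOnly =
        new BooleanQuery.Builder()
            .add(new TermQuery(new Term("f", "x")), Occur.FILTER)
            .add(new TermQuery(new Term("f", "y")), Occur.MUST_NOT)
            .build();
    assert QueryUtils.isConstantScoreQuery(filtersOnly);

    // an ordinary scoring query is not constant-score
    assert !QueryUtils.isConstantScoreQuery(new TermQuery(new Term("f", "x")));
  }
}
```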

/** Returns the original query if it was already a positive query, otherwise
* return the negative of the query (i.e., a positive query).
* <p>
@@ -155,6 +186,13 @@ public static Query combineQueryAndFilter(Query scoreQuery, Query filterQuery) {
if (filterQuery == null) {
return new MatchAllDocsQuery(); // default if nothing -- match everything
} else {
/*
NOTE: we _must_ wrap filter in a ConstantScoreQuery (default score `1f`) in order to
guarantee score parity with the actual user-specified scoreQuery (i.e., MatchAllDocsQuery).
This should only matter if score is _explicitly_ requested to be returned, but we don't know
that here, and it's probably not worth jumping through the necessary hoops simply to avoid
wrapping in the case where `true==isConstantScoreQuery(filterQuery)`
*/
return new ConstantScoreQuery(filterQuery);
}
} else {
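The branch shown above appears to handle the case where the user's score query is effectively a match-all or constant-score query; its enclosing condition is outside this excerpt. As an illustrative comparison of the score-parity point in the NOTE (hypothetical field names, not code from this PR): a `MatchAllDocsQuery` scores every matching document 1.0f, and `new ConstantScoreQuery(filter)` with its default boost also scores 1.0f, so substituting the wrapped filter preserves returned scores.

```java
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

class ScoreParityExample {
  static void example() {
    // What `q=*:*&fq=f:x` means logically: a match-all query plus a non-scoring filter clause.
    Query matchAllPlusFilter =
        new BooleanQuery.Builder()
            .add(new MatchAllDocsQuery(), Occur.MUST)             // contributes 1.0f per doc
            .add(new TermQuery(new Term("f", "x")), Occur.FILTER) // contributes no score
            .build();

    // What the branch above produces instead: the filter wrapped in a ConstantScoreQuery,
    // whose default score is also 1.0f per matching doc.
    Query wrappedFilter = new ConstantScoreQuery(new TermQuery(new Term("f", "x")));

    // Both forms match the same documents and score each of them 1.0f, so the substitution
    // is safe even if the client explicitly asks for scores to be returned.
  }
}
```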