diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java index 839e7d55595..844c8aa9908 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java @@ -118,10 +118,13 @@ public class ElasticIndexDefinition extends IndexDefinition { // MLT queries, when no fields are specified, do not use the entire document but only a maximum of // max_query_terms (default 25). Even increasing this value, the query could produce not so relevant // results (eg: based on the :fulltext content). To work this around, we can specify DYNAMIC_BOOST_FULLTEXT - // field as first field since it usually contains relevant terms. This will make sure that the MLT queries - // give more priority to the terms in this field while the rest (*) are considered secondary. + // field with overridden mlt params and increased boost since it usually contains relevant terms. This will make sure + // that the MLT queries give more priority to the terms in this field while the rest (*) are considered secondary. + // TODO: we can further improve search relevance by using the actual tags combined with the weights using a function query. + // Right now, we are just matching the tags without looking at the weights. Therefore, a tag can be matched in a field with a lower weight. private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new String[] { - DYNAMIC_BOOST_FULLTEXT, "*" + "mlt.fl=" + DYNAMIC_BOOST_FULLTEXT + "&mlt.mintf=1&mlt.qf=3", + "mlt.fl=*&mlt.mintf=2" }; private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost"; diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java index 9a3dee6ea8e..7ef2959e39c 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java @@ -182,7 +182,16 @@ public BoolQuery.Builder baseQueryBuilder() { // mlt?mlt.fl=:path&mlt.mindf=0&stream.body= . We need parse this query // string and turn into a query // elastic can understand. - bqb.must(m -> m.moreLikeThis(mltQuery(mltParams))); + String fields = mltParams.remove(MoreLikeThisHelperUtil.MLT_FILED); + if (fields == null || FieldNames.PATH.equals(fields)) { + for (String stf : elasticIndexDefinition.getSimilarityTagsFields()) { + Map shallowMltParams = new HashMap<>(MoreLikeThisHelperUtil.getParamMapFromMltQuery(stf)); + shallowMltParams.putAll(mltParams); + bqb.should(m -> m.moreLikeThis(mltQuery(shallowMltParams))); + } + } else { + bqb.must(m -> m.moreLikeThis(mltQuery(mltParams))); + } } else { bqb.must(m -> m.bool(similarityQuery(queryNodePath, sp))); } @@ -393,12 +402,7 @@ private MoreLikeThisQuery mltQuery(Map mltParams) { // We store path as the _id so no need to do anything extra here // We expect Similar impl to send a query where text would have evaluated to // node path. - mlt.like(l -> l.document(d -> d.id(id) - // this is needed to workaround https://github.com/elastic/elasticsearch/pull/94518 that causes empty - // results when the _ignored metadata field is part of the input document - .perFieldAnalyzer("_ignored", "keyword"))); - // when no fields are specified, we set the ones from the index definition - mlt.fields(Arrays.asList(elasticIndexDefinition.getSimilarityTagsFields())); + mlt.like(l -> l.document(d -> d.id(id))); } else { // This is for native queries if someone sends additional fields via // mlt.fl=field1,field2 diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java index c787d1e9eba..e5b25dd063e 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexQueryCommonTest.java @@ -52,7 +52,7 @@ public void similarityQueryShouldCorrectlyHandleSimilarityTags() throws CommitFa String query = "explain select [jcr:path] from [nt:base] where " + "native('lucene', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')"; - String explainWithoutSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}"; + String explainWithoutSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}"; Tree test = root.getTree("/").addChild("test"); test.addChild("a").setProperty("text", "Hello World"); @@ -71,7 +71,7 @@ public void similarityQueryShouldCorrectlyHandleSimilarityTags() throws CommitFa // similarity tags enabled, but no similarity tags properties configured, should not be present in the explain output assertEventually(getAssertionForExplain(query, Query.JCR_SQL2, explainWithoutSimilarityTags, false)); - String explainWithSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}],\"should\":[{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}"; + String explainWithSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}"; Tree properties = indexDefn.getChild(FulltextIndexConstants.INDEX_RULES).getChild("nt:base").getChild("properties"); Tree simProp = TestUtil.enableForFullText(properties, "simProp", false); simProp.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, true); diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java index b4a51d6c515..3055007bfa1 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java @@ -131,8 +131,7 @@ public void repSimilarQueryWithLongPath() throws Exception { } /** - * Validates the workaround for 94518 produces the - * expected results + * This test checks 94518 issue. */ @Test public void repSimilarQueryWithIgnoredMetadataField() throws Exception { diff --git a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java index 9cd2d5edbed..9fb4cc6053e 100644 --- a/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java +++ b/oak-search/src/test/java/org/apache/jackrabbit/oak/plugins/index/DynamicBoostCommonTest.java @@ -175,6 +175,16 @@ public void dynamicBoostShouldNotMatchOnSingleFields() throws Exception { }); } + @Test + public void dynamicBoostedTagsShouldShouldBeUsedInSimilarityQueries() throws Exception { + boolean lite = areAnalyzeFeaturesSupportedInLiteModeOnly(); + createAssetsIndexAndProperties(lite, lite); + prepareTestAssets(); + + assertEventually(() -> assertOrderedQuery("select [jcr:path] from [dam:Asset] where similar(., '/test/asset1')", + List.of("/test/asset1", "/test/asset2", "/test/asset3"))); + } + protected abstract String getTestQueryDynamicBoostBasicExplained(); protected boolean areAnalyzeFeaturesSupportedInLiteModeOnly() {