Skip to content

Commit

Permalink
OAK-10762: dynamic boosted tags need to specify extra parameters (#1416)
Browse files Browse the repository at this point in the history
* OAK-10762: dynamic boosted tags need to specify extra parameters

* OAK-10762: remove workaround for elastic/elasticsearch#94518

* OAK-10762: add todo on how to further improve relevance

* OAK-10762: fix failing tests

* OAK-10762: typo

* OAK-10762: improved todo comment
  • Loading branch information
fabriziofortino committed Apr 23, 2024
1 parent 3abb2b8 commit 931dfff
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 14 deletions.
Expand Up @@ -118,10 +118,13 @@ public class ElasticIndexDefinition extends IndexDefinition {
// MLT queries, when no fields are specified, do not use the entire document but only a maximum of
// max_query_terms (default 25). Even increasing this value, the query could produce not so relevant
// results (eg: based on the :fulltext content). To work this around, we can specify DYNAMIC_BOOST_FULLTEXT
// field as first field since it usually contains relevant terms. This will make sure that the MLT queries
// give more priority to the terms in this field while the rest (*) are considered secondary.
// field with overridden mlt params and increased boost since it usually contains relevant terms. This will make sure
// that the MLT queries give more priority to the terms in this field while the rest (*) are considered secondary.
// TODO: we can further improve search relevance by using the actual tags combined with the weights using a function query.
// Right now, we are just matching the tags without looking at the weights. Therefore, a tag can be matched in a field with a lower weight.
private static final String[] SIMILARITY_TAGS_FIELDS_DEFAULT = new String[] {
DYNAMIC_BOOST_FULLTEXT, "*"
"mlt.fl=" + DYNAMIC_BOOST_FULLTEXT + "&mlt.mintf=1&mlt.qf=3",
"mlt.fl=*&mlt.mintf=2"
};

private static final String SIMILARITY_TAGS_BOOST = "similarityTagsBoost";
Expand Down
Expand Up @@ -182,7 +182,16 @@ public BoolQuery.Builder baseQueryBuilder() {
// mlt?mlt.fl=:path&mlt.mindf=0&stream.body=<path> . We need parse this query
// string and turn into a query
// elastic can understand.
bqb.must(m -> m.moreLikeThis(mltQuery(mltParams)));
String fields = mltParams.remove(MoreLikeThisHelperUtil.MLT_FILED);
if (fields == null || FieldNames.PATH.equals(fields)) {
for (String stf : elasticIndexDefinition.getSimilarityTagsFields()) {
Map<String, String> shallowMltParams = new HashMap<>(MoreLikeThisHelperUtil.getParamMapFromMltQuery(stf));
shallowMltParams.putAll(mltParams);
bqb.should(m -> m.moreLikeThis(mltQuery(shallowMltParams)));
}
} else {
bqb.must(m -> m.moreLikeThis(mltQuery(mltParams)));
}
} else {
bqb.must(m -> m.bool(similarityQuery(queryNodePath, sp)));
}
Expand Down Expand Up @@ -393,12 +402,7 @@ private MoreLikeThisQuery mltQuery(Map<String, String> mltParams) {
// We store path as the _id so no need to do anything extra here
// We expect Similar impl to send a query where text would have evaluated to
// node path.
mlt.like(l -> l.document(d -> d.id(id)
// this is needed to workaround https://github.com/elastic/elasticsearch/pull/94518 that causes empty
// results when the _ignored metadata field is part of the input document
.perFieldAnalyzer("_ignored", "keyword")));
// when no fields are specified, we set the ones from the index definition
mlt.fields(Arrays.asList(elasticIndexDefinition.getSimilarityTagsFields()));
mlt.like(l -> l.document(d -> d.id(id)));
} else {
// This is for native queries if someone sends additional fields via
// mlt.fl=field1,field2
Expand Down
Expand Up @@ -52,7 +52,7 @@ public void similarityQueryShouldCorrectlyHandleSimilarityTags() throws CommitFa
String query = "explain select [jcr:path] from [nt:base] where " +
"native('lucene', 'mlt?stream.body=/test/a&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";

String explainWithoutSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}";
String explainWithoutSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}";

Tree test = root.getTree("/").addChild("test");
test.addChild("a").setProperty("text", "Hello World");
Expand All @@ -71,7 +71,7 @@ public void similarityQueryShouldCorrectlyHandleSimilarityTags() throws CommitFa
// similarity tags enabled, but no similarity tags properties configured, should not be present in the explain output
assertEventually(getAssertionForExplain(query, Query.JCR_SQL2, explainWithoutSimilarityTags, false));

String explainWithSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"must\":[{\"more_like_this\":{\"fields\":[\":dynamic-boost-ft\",\"*\"],\"include\":true,\"like\":[{\"_id\":\"/test/a\",\"per_field_analyzer\":{\"_ignored\":\"keyword\"}}],\"min_doc_freq\":0,\"min_term_freq\":0}}],\"should\":[{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}";
String explainWithSimilarityTags = "{\"_source\":{\"includes\":[\":path\"]},\"query\":{\"bool\":{\"should\":[{\"more_like_this\":{\"boost\":3.0,\"include\":true,\"like\":[{\"fields\":[\":dynamic-boost-ft\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"include\":true,\"like\":[{\"fields\":[\"*\"],\"_id\":\"/test/a\"}],\"min_doc_freq\":0,\"min_term_freq\":0}},{\"more_like_this\":{\"boost\":0.5,\"fields\":[\":simTags\"],\"like\":[{\"_id\":\"/test/a\"}],\"min_doc_freq\":1,\"min_term_freq\":1}}]}},\"size\":10,\"sort\":[{\"_score\":{\"order\":\"desc\"}},{\":path\":{\"order\":\"asc\"}}],\"track_total_hits\":10000}";
Tree properties = indexDefn.getChild(FulltextIndexConstants.INDEX_RULES).getChild("nt:base").getChild("properties");
Tree simProp = TestUtil.enableForFullText(properties, "simProp", false);
simProp.setProperty(FulltextIndexConstants.PROP_SIMILARITY_TAGS, true);
Expand Down
Expand Up @@ -131,8 +131,7 @@ public void repSimilarQueryWithLongPath() throws Exception {
}

/**
* Validates the workaround for <a href="https://github.com/elastic/elasticsearch/pull/94518">94518</a> produces the
* expected results
* This test checks <a href="https://github.com/elastic/elasticsearch/pull/94518">94518</a> issue.
*/
@Test
public void repSimilarQueryWithIgnoredMetadataField() throws Exception {
Expand Down
Expand Up @@ -175,6 +175,16 @@ public void dynamicBoostShouldNotMatchOnSingleFields() throws Exception {
});
}

@Test
public void dynamicBoostedTagsShouldShouldBeUsedInSimilarityQueries() throws Exception {
boolean lite = areAnalyzeFeaturesSupportedInLiteModeOnly();
createAssetsIndexAndProperties(lite, lite);
prepareTestAssets();

assertEventually(() -> assertOrderedQuery("select [jcr:path] from [dam:Asset] where similar(., '/test/asset1')",
List.of("/test/asset1", "/test/asset2", "/test/asset3")));
}

protected abstract String getTestQueryDynamicBoostBasicExplained();

protected boolean areAnalyzeFeaturesSupportedInLiteModeOnly() {
Expand Down

0 comments on commit 931dfff

Please sign in to comment.