Skip to content

Commit

Permalink
OAK-10146: oak-search-elastic: similarity search does not work for so…
Browse files Browse the repository at this point in the history
…me nodes (#873)

* OAK-10146: removed duplicate tests already part of oak-search common tests

* OAK-10146: workaround for elastic/elasticsearch#94518

* OAK-10146: consolidate usage of java 9/10 immutable data structures
  • Loading branch information
fabriziofortino committed Mar 21, 2023
1 parent fe33e8c commit 05a8563
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 161 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Base64;
import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -116,7 +115,7 @@ public class ElasticRequestHandler {
private static final Logger LOG = LoggerFactory.getLogger(ElasticRequestHandler.class);
private final static String SPELLCHECK_PREFIX = "spellcheck?term=";
protected final static String SUGGEST_PREFIX = "suggest?term=";
private static final List<SortOptions> DEFAULT_SORTS = Arrays.asList(
private static final List<SortOptions> DEFAULT_SORTS = List.of(
SortOptions.of(so -> so.field(f -> f.field("_score").order(SortOrder.Desc))),
SortOptions.of(so -> so.field(f -> f.field(FieldNames.PATH).order(SortOrder.Asc)))
);
Expand Down Expand Up @@ -389,11 +388,14 @@ private MoreLikeThisQuery mltQuery(Map<String, String> mltParams) {
// We store path as the _id so no need to do anything extra here
// We expect Similar impl to send a query where text would have evaluated to
// node path.
mlt.like(l -> l.document(d -> d.id(id)));
mlt.like(l -> l.document(d -> d.id(id)
// this is needed to workaround https://github.com/elastic/elasticsearch/pull/94518 that causes empty
// results when the _ignored metadata field is part of the input document
.perFieldAnalyzer("_ignored", "keyword")));
} else {
// This is for native queries if someone sends additional fields via
// mlt.fl=field1,field2
mlt.like(l -> l.document(d -> d.fields(Arrays.asList(fields.split(","))).id(id)));
mlt.like(l -> l.document(d -> d.fields(List.of(fields.split(","))).id(id)));
}
// include the input doc to align the Lucene behaviour TODO: add configuration
// parameter
Expand Down Expand Up @@ -424,7 +426,7 @@ private MoreLikeThisQuery mltQuery(Map<String, String> mltParams) {
mltParamSetter.accept(MoreLikeThisHelperUtil.MLT_STOP_WORDS, (val) -> {
// TODO : Read this from a stopwords text file, configured via index defn maybe
// ?
mlt.stopWords(Arrays.asList(val.split(",")));
mlt.stopWords(List.of(val.split(",")));
});

if (!shallowMltParams.isEmpty()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,10 @@

import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.net.URI;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
Expand All @@ -48,6 +45,7 @@
import java.util.Random;
import java.util.UUID;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
Expand All @@ -56,79 +54,6 @@

public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest {

/*
This test mirror the test org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarAsNativeQuery
Exact same test data, to test out for feature parity
The only difference is the same query in lucene returns the doc itself (the one that we need similar docs of) as part of search results
whereas in elastic, it doesn't.
*/
@Test
public void repSimilarAsNativeQuery() throws Exception {

createIndex(true);

String nativeQueryString = "select [jcr:path] from [nt:base] where " +
"native('elastic-sim', 'mlt?stream.body=/test/c&mlt.fl=:path&mlt.mindf=0&mlt.mintf=0')";
Tree test = root.getTree("/").addChild("test");
test.addChild("a").setProperty("text", "Hello World");
test.addChild("b").setProperty("text", "He said Hello and then the world said Hello as well.");
test.addChild("c").setProperty("text", "He said Hi.");
root.commit();

assertEventually(() -> assertQuery(nativeQueryString, Arrays.asList("/test/c", "/test/b")));
}

/*
This test mirror the test org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarQuery
Exact same test data, to test out for feature parity
The only difference is the same query in lucene returns the doc itself (the one that we need similar docs of) as part of search results
whereas in elastic, it doesn't.
*/
@Test
public void repSimilarQuery() throws Exception {
createIndex(false);

String query = "select [jcr:path] from [nt:base] where similar(., '/test/a')";
Tree test = root.getTree("/").addChild("test");
test.addChild("a").setProperty("text", "Hello World Hello World");
test.addChild("b").setProperty("text", "Hello World");
test.addChild("c").setProperty("text", "World");
test.addChild("d").setProperty("text", "Hello");
test.addChild("e").setProperty("text", "Bye Bye");
test.addChild("f").setProperty("text", "Hello");
test.addChild("g").setProperty("text", "World");
test.addChild("h").setProperty("text", "Hello");
root.commit();

assertEventually(() -> assertQuery(query,
Arrays.asList("/test/a", "/test/b", "/test/c", "/test/d", "/test/f", "/test/g", "/test/h")));
}

/*
This test mirror the test org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexQueryTest#testRepSimilarXPathQuery
Exact same test data, to test out for feature parity
The only difference is the same query in lucene returns the doc itself (the one that we need similar docs of) as part of search results
whereas in elastic, it doesn't.
*/
@Test
public void repSimilarXPathQuery() throws Exception {
createIndex(false);

String query = "//element(*, nt:base)[rep:similar(., '/test/a')]";
Tree test = root.getTree("/").addChild("test");
test.addChild("a").setProperty("text", "Hello World Hello World");
test.addChild("b").setProperty("text", "Hello World");
test.addChild("c").setProperty("text", "World");
test.addChild("d").setProperty("text", "Hello");
test.addChild("e").setProperty("text", "Bye Bye");
test.addChild("f").setProperty("text", "Hello");
test.addChild("g").setProperty("text", "World");
test.addChild("h").setProperty("text", "Hello");
root.commit();
assertEventually(() -> assertQuery(query, XPATH,
Arrays.asList("/test/a", "/test/b", "/test/c", "/test/d", "/test/f", "/test/g", "/test/h")));
}

@Test
public void repSimilarWithStopWords() throws Exception {
createIndex(true);
Expand All @@ -150,11 +75,10 @@ public void repSimilarWithStopWords() throws Exception {
root.commit();

// Matches due to terms Hello or bye should be ignored
assertEventually(() -> assertQuery(nativeQueryStringWithStopWords,
Arrays.asList("/test/a", "/test/e", "/test/f")));
assertEventually(() -> assertQuery(nativeQueryStringWithStopWords, List.of("/test/a", "/test/e", "/test/f")));

assertEventually(() -> assertQuery(nativeQueryStringWithoutStopWords,
Arrays.asList("/test/a", "/test/b", "/test/c", "/test/d", "/test/e", "/test/f")));
List.of("/test/a", "/test/b", "/test/c", "/test/d", "/test/e", "/test/f")));
}

@Test
Expand All @@ -175,11 +99,10 @@ public void repSimilarWithMinWordLength() throws Exception {

// Matches because of term Hello should be ignored since wl <6 (so /test/ should NOT be in the match list)
// /test/d should be in match list (because of Worlds term)
assertEventually(() -> assertQuery(nativeQueryStringWithMinWordLength,
Arrays.asList("/test/a", "/test/c", "/test/d")));
assertEventually(() -> assertQuery(nativeQueryStringWithMinWordLength, List.of("/test/a", "/test/c", "/test/d")));

assertEventually(() -> assertQuery(nativeQueryStringWithoutMinWordLength,
Arrays.asList("/test/a", "/test/b", "/test/c", "/test/d")));
List.of("/test/a", "/test/b", "/test/c", "/test/d")));
}

@Test
Expand All @@ -204,7 +127,25 @@ public void repSimilarQueryWithLongPath() throws Exception {
String query = "select [jcr:path] from [nt:base] where similar(., '" + p + "')";

assertEventually(() -> assertQuery(query,
Arrays.asList(p, "/test/b", "/test/c", "/test/d", "/test/f", "/test/g", "/test/h")));
List.of(p, "/test/b", "/test/c", "/test/d", "/test/f", "/test/g", "/test/h")));
}

/**
* Validates the workaround for <a href="https://github.com/elastic/elasticsearch/pull/94518">94518</a> produces the
* expected results
*/
@Test
public void repSimilarQueryWithIgnoredMetadataField() throws Exception {
createIndex(false);
Tree test = root.getTree("/").addChild("test");

// the max keyword length is 256, this field will be then listed as _ignored
test.addChild("a").setProperty("text", ElasticTestUtils.randomString(1000));
root.commit();

String query = "select [jcr:path] from [nt:base] where similar(., '/test/a')";

assertEventually(() -> assertQuery(query, List.of("/test/a")));
}

@Test
Expand Down Expand Up @@ -278,7 +219,7 @@ public void vectorSimilarityWithWrongVectorSizes() throws Exception {

for (String line : IOUtils.readLines(Files.newInputStream(file.toPath()), Charset.defaultCharset())) {
String[] split = line.split(",");
List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
List<Double> values = Stream.of(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
byte[] bytes = toByteArray(values);
List<Double> actual = toDoubles(bytes);
assertEquals(values, actual);
Expand All @@ -305,10 +246,10 @@ public void vectorSimilarity() throws Exception {
URI uri = getClass().getResource("/org/apache/jackrabbit/oak/query/fvs.csv").toURI();
File file = new File(uri);

Collection<String> children = new LinkedList<>();
List<String> children = new LinkedList<>();
for (String line : IOUtils.readLines(Files.newInputStream(file.toPath()), Charset.defaultCharset())) {
String[] split = line.split(",");
List<Double> values = Arrays.stream(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
List<Double> values = Stream.of(split).skip(1).map(Double::parseDouble).collect(Collectors.toList());
byte[] bytes = toByteArray(values);
List<Double> actual = toDoubles(bytes);
assertEquals(values, actual);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@

import co.elastic.clients.transport.Version;
import com.github.dockerjava.api.DockerClient;
import com.google.common.collect.ImmutableMap;
import org.apache.jackrabbit.oak.commons.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -42,15 +41,15 @@

public class ElasticTestServer implements AutoCloseable {
private static final Logger LOG = LoggerFactory.getLogger(ElasticTestServer.class);
private static final Map<String, String> PLUGIN_OFFICIAL_RELEASES_DIGEST_MAP = ImmutableMap.<String, String>builder()
.put("7.17.3.0", "5e3b40bb72b2813f927be9bf6ecdf88668d89d2ef20c7ebafaa51ab8407fd179")
.put("7.17.6.0", "326893bb98ef1a0c569d9f4c4a9a073e53361924f990b17e87077985ce8a7478")
.put("7.17.7.0", "4252eb55cc7775f1b889d624ac335abfa2e357931c40d0accb4d520144246b8b")
.put("8.3.3.0", "14d3223456f4b9f00f86628ec8400cb46513935e618ae0f5d0d1088739ccc233")
.put("8.4.1.0", "56797a1bac6ceeaa36d2358f818b14633124d79c5e04630fa3544603d82eaa01")
.put("8.4.2.0", "5ce81ad043816900a496ad5b3cce7de1d99547ebf92aa1f9856343e48580c71c")
.put("8.4.3.0", "5c00d43cdd56c5c5d8e9032ad507acea482fb5ca9445861c5cc12ad63af66425")
.build();
private static final Map<String, String> PLUGIN_OFFICIAL_RELEASES_DIGEST_MAP = Map.of(
"7.17.3.0", "5e3b40bb72b2813f927be9bf6ecdf88668d89d2ef20c7ebafaa51ab8407fd179",
"7.17.6.0", "326893bb98ef1a0c569d9f4c4a9a073e53361924f990b17e87077985ce8a7478",
"7.17.7.0", "4252eb55cc7775f1b889d624ac335abfa2e357931c40d0accb4d520144246b8b",
"8.3.3.0", "14d3223456f4b9f00f86628ec8400cb46513935e618ae0f5d0d1088739ccc233",
"8.4.1.0", "56797a1bac6ceeaa36d2358f818b14633124d79c5e04630fa3544603d82eaa01",
"8.4.2.0", "5ce81ad043816900a496ad5b3cce7de1d99547ebf92aa1f9856343e48580c71c",
"8.4.3.0", "5c00d43cdd56c5c5d8e9032ad507acea482fb5ca9445861c5cc12ad63af66425",
"8.5.3.0", "d4c13f68650f9df5ff8c74ec83abc2e416de9c45f991d459326e0e2baf7b0e3f");

private static final ElasticTestServer SERVER = new ElasticTestServer();
private static volatile ElasticsearchContainer CONTAINER;
Expand Down

0 comments on commit 05a8563

Please sign in to comment.