Skip to content
This repository has been archived by the owner on May 27, 2020. It is now read-only.

complex analyzer builder without requiring a custom jar addition #306

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -29,7 +29,8 @@
*/
@JsonTypeInfo(use = JsonTypeInfo.Id.NAME, include = JsonTypeInfo.As.PROPERTY, property = "type")
@JsonSubTypes({@JsonSubTypes.Type(value = ClasspathAnalyzerBuilder.class, name = "classpath"),
@JsonSubTypes.Type(value = SnowballAnalyzerBuilder.class, name = "snowball")})
@JsonSubTypes.Type(value = SnowballAnalyzerBuilder.class, name = "snowball"),
@JsonSubTypes.Type(value = ComplexAnalyzerBuilder.class, name = "complex")})
public abstract class AnalyzerBuilder {

/**
Expand Down
@@ -0,0 +1,152 @@
/*
* Licensed to STRATIO (C) under one or more contributor license agreements.
* See the NOTICE file distributed with this work for additional information
* regarding copyright ownership. The STRATIO (C) licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.stratio.cassandra.lucene.schema.analysis;

import com.stratio.cassandra.lucene.IndexException;
import org.apache.lucene.analysis.util.CharArraySet;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;

import java.lang.reflect.Constructor;
import java.lang.reflect.Modifier;
import java.util.List;
import java.util.function.Function;
import java.util.stream.Stream;

import static java.util.Arrays.asList;
import static java.util.Locale.ENGLISH;
import static java.util.stream.Collectors.toList;

/**
 * Builds an instance of a configured class through reflection.
 *
 * <p>When no parameters are configured the public no-arg constructor is used. Otherwise the single public
 * constructor declaring as many parameters as were configured is selected, and every configured string is
 * coerced to the type declared by that constructor. Supported target types are {@code int}, {@code long},
 * {@code short}, {@code boolean}, {@code float}, {@code double}, {@link String} and {@link CharArraySet}
 * (comma separated values, case-insensitive set). Any other type is delegated to the value provider given to
 * {@link #build(Class, Function)}; the configured string at that position is then ignored and only acts as a
 * placeholder.
 *
 * <p>The class name may be a fully qualified name or one of the short aliases resolved by
 * {@link #replaceAlias(String)}.
 */
public class ClassFactoryBuilder {

    /** Fully qualified name of the class to instantiate, after alias resolution. */
    @JsonProperty("class")
    private final String className;

    /** Raw constructor arguments, in constructor declaration order; may be {@code null} or empty. */
    @JsonProperty("parameters")
    private final String[] parameters;

    /**
     * Creates a new factory for the specified class.
     *
     * @param className the fully qualified class name, or one of the supported aliases; required
     * @param parameters the raw constructor arguments, {@code null} or empty to use the no-arg constructor
     * @throws IndexException if {@code className} is {@code null}
     */
    @JsonCreator
    public ClassFactoryBuilder(@JsonProperty("class") String className,
                               @JsonProperty("parameters") String[] parameters) {
        if (className == null) {
            // Fail with an explicit message instead of a bare NullPointerException from replaceAlias.
            throw new IndexException("Missing required property '%s'", "class");
        }
        this.className = replaceAlias(className);
        this.parameters = parameters;
    }

    /**
     * Resolves a short alias to the fully qualified name of the matching Lucene class.
     *
     * <p>Matching ignores case as well as {@code _} and {@code -}, so {@code edge-ngram}, {@code edge_ngram}
     * and {@code edgengram} are all equivalent. Names that are not an alias are returned untouched.
     *
     * @param className the configured class name or alias
     * @return the fully qualified class name to load
     */
    private String replaceAlias(final String className) {
        // support edge-ngram or edge_ngram or edgengram styles depending what is the more readable for the user
        switch (className.replace("_", "").replace("-", "").toLowerCase(ENGLISH)) {
            // tokenizers
            case "ngram":
                return "org.apache.lucene.analysis.ngram.NGramTokenizer";
            case "edgengram":
                return "org.apache.lucene.analysis.ngram.EdgeNGramTokenizer";
            case "pattern":
                return "org.apache.lucene.analysis.pattern.PatternTokenizer";
            case "classic":
                return "org.apache.lucene.analysis.standard.ClassicTokenizer";
            case "keyword":
                return "org.apache.lucene.analysis.core.KeywordTokenizer";
            // filters
            case "limitcount":
                return "org.apache.lucene.analysis.miscellaneous.LimitTokenCountFilter";
            case "limitoffset":
                return "org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter";
            case "limitposition":
                return "org.apache.lucene.analysis.miscellaneous.LimitTokenPositionFilter";
            case "edgengramfilter":
                return "org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter";
            case "ngramfilter":
                return "org.apache.lucene.analysis.ngram.NGramTokenFilter";
            case "kstemfilter":
                return "org.apache.lucene.analysis.en.KStemFilter";
            case "shingle":
                return "org.apache.lucene.analysis.shingle.ShingleFilter";
            case "trim":
                return "org.apache.lucene.analysis.miscellaneous.TrimFilter";
            case "stop":
                return "org.apache.lucene.analysis.core.StopFilter";
            case "lower":
                return "org.apache.lucene.analysis.core.LowerCaseFilter";
            case "standard":
                return "org.apache.lucene.analysis.standard.StandardFilter";
            case "numericpayload":
                return "org.apache.lucene.analysis.payloads.NumericPayloadTokenFilter";
            default:
                return className;
        }
    }

    /**
     * Instantiates the configured class.
     *
     * @param expected the type the configured class must be assignable to
     * @param valueProvider resolves constructor parameters whose type cannot be coerced from a string; it
     * receives the declared parameter type and returns the value to inject, or {@code null} if it cannot
     * supply one. May itself be {@code null}.
     * @param <T> the expected type
     * @return a new instance of the configured class
     * @throws IndexException if the class can't be found, isn't assignable to {@code expected}, has no
     * unambiguous matching public constructor, declares an unsupported parameter type, or fails to instantiate
     */
    public <T> T build(final Class<T> expected, final Function<Class<?>, Object> valueProvider) {
        try {
            final Class<?> impl = Class.forName(className);
            if (!expected.isAssignableFrom(impl)) {
                throw new IndexException("'%s' doesn't implement '%s'", className, expected.getName());
            }

            if (parameters == null || parameters.length == 0) {
                return expected.cast(impl.getConstructor().newInstance());
            }

            final Constructor<?> constructor = findConstructor(impl);
            final Class<?>[] types = constructor.getParameterTypes();
            final Object[] args = new Object[parameters.length];
            for (int i = 0; i < args.length; i++) {
                args[i] = coerce(types[i], parameters[i], valueProvider);
            }
            return expected.cast(constructor.newInstance(args));
        } catch (final IndexException e) {
            // Already carries a precise diagnostic; don't bury it under a generic message.
            throw e;
        } catch (final ClassNotFoundException e) {
            throw new IndexException(e, "Didn't find '%s'", className);
        } catch (final Exception e) {
            // Reflection, number parsing or constructor failures: the class exists but couldn't be built.
            throw new IndexException(e, "Unable to instantiate '%s'", className);
        }
    }

    /**
     * Returns the only public constructor declaring as many parameters as were configured.
     *
     * @param impl the class to inspect
     * @return the matching constructor
     * @throws IndexException if there is no such constructor or more than one
     */
    private Constructor<?> findConstructor(final Class<?> impl) {
        final List<Constructor<?>> candidates = Stream.of(impl.getConstructors())
                .filter(c -> c.getParameterCount() == parameters.length && Modifier.isPublic(c.getModifiers()))
                .collect(toList());
        if (candidates.isEmpty()) {
            throw new IndexException("No constructor with %s parameters in '%s'", parameters.length, className);
        }
        if (candidates.size() > 1) {
            throw new IndexException("Ambiguous constructor with %s parameters in '%s'",
                                     parameters.length,
                                     className);
        }
        return candidates.get(0);
    }

    /**
     * Converts a raw configured value to the type declared by the constructor.
     *
     * @param type the declared constructor parameter type
     * @param raw the configured string value; ignored when the value comes from {@code valueProvider}
     * @param valueProvider fallback for types that can't be built from a string, may be {@code null}
     * @return the argument to pass to the constructor
     * @throws IndexException if the type is unsupported and the provider can't supply a value
     */
    private Object coerce(final Class<?> type, final String raw, final Function<Class<?>, Object> valueProvider) {
        if (type == int.class) {
            return Integer.parseInt(raw);
        }
        if (type == long.class) {
            return Long.parseLong(raw);
        }
        if (type == short.class) {
            return Short.parseShort(raw);
        }
        if (type == boolean.class) {
            return Boolean.parseBoolean(raw);
        }
        if (type == float.class) {
            return Float.parseFloat(raw);
        }
        if (type == double.class) {
            return Double.parseDouble(raw);
        }
        if (type == String.class) {
            return raw;
        }
        if (type == CharArraySet.class) {
            return new CharArraySet(asList(raw.split(",")), true);
        }

        final Object provided = valueProvider == null ? null : valueProvider.apply(type);
        if (provided == null) {
            throw new IndexException("Unsupported constructor parameter type '%s' for '%s'",
                                     type.getName(),
                                     className);
        }
        return provided;
    }
}
@@ -0,0 +1,95 @@
/*
* Licensed to STRATIO (C) under one or more contributor license agreements.
* See the NOTICE file distributed with this work for additional information
* regarding copyright ownership. The STRATIO (C) licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package com.stratio.cassandra.lucene.schema.analysis;

import com.stratio.cassandra.lucene.IndexException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.codehaus.jackson.annotate.JsonCreator;
import org.codehaus.jackson.annotate.JsonProperty;

import java.util.Collections;
import java.util.List;

/**
 * {@link AnalyzerBuilder} for building {@link Analyzer}s from a configurable tokenizer wrapped by a
 * configurable chain of token streams.
 */
public class ComplexAnalyzerBuilder extends AnalyzerBuilder {

    /**
     * The tokenizer to use to build this analyzer.
     */
    @JsonProperty("tokenizer")
    private final ClassFactoryBuilder tokenizer;

    /**
     * The token streams wrapping the tokenizer, listed outermost first: the last entry wraps the tokenizer and
     * each preceding entry wraps the one after it. The constructor parameter of type {@link TokenStream} of
     * each entry is replaced by the wrapped instance, so any placeholder value can be configured for it.
     */
    @JsonProperty("token_streams")
    private final List<ClassFactoryBuilder> tokenStreams;

    /**
     * Creates a new builder.
     *
     * @param className the factory of the tokenizer
     * @param tokenStreams the factories of the token streams wrapping the tokenizer, outermost first; may be
     * {@code null}
     */
    @JsonCreator
    public ComplexAnalyzerBuilder(@JsonProperty("tokenizer") ClassFactoryBuilder className,
                                  @JsonProperty("token_streams") List<ClassFactoryBuilder> tokenStreams) {
        this.tokenizer = className;
        this.tokenStreams = tokenStreams;
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public Analyzer analyzer() {
        try {
            final ComplexAnalyzer analyzer = new ComplexAnalyzer(tokenizer, tokenStreams);
            // Build one throwaway chain so that an invalid configuration is reported here, when the analyzer
            // is created, rather than on first use.
            analyzer.newComponents();
            return analyzer;
        } catch (final Exception e) {
            throw new IndexException(e);
        }
    }

    /**
     * {@link Analyzer} creating a fresh tokenizer and token stream chain each time components are requested,
     * so that no tokenizer or filter instance is shared between the component sets handed to Lucene.
     */
    public static class ComplexAnalyzer extends Analyzer {
        private final ClassFactoryBuilder tokenizer;
        private final List<ClassFactoryBuilder> tokenStreams;

        private ComplexAnalyzer(final ClassFactoryBuilder tokenizer, final List<ClassFactoryBuilder> tokenStreams) {
            this.tokenizer = tokenizer;
            this.tokenStreams = tokenStreams == null
                                ? Collections.<ClassFactoryBuilder>emptyList()
                                : tokenStreams;
        }

        /**
         * Instantiates the tokenizer and wraps it with the configured token streams.
         *
         * <p>The list is walked backwards by index instead of being reversed in place: reversing the shared
         * list would flip the wrapping order on every other invocation.
         *
         * @return a new, independent set of components
         */
        private TokenStreamComponents newComponents() {
            final Tokenizer source = tokenizer.build(Tokenizer.class, null);
            TokenStream stream = source;
            for (int i = tokenStreams.size() - 1; i >= 0; i--) {
                final TokenStream wrapped = stream;
                // Only the TokenStream parameter is injected; every other type is left to string coercion.
                stream = tokenStreams.get(i).build(TokenStream.class, type -> {
                    if (type == TokenStream.class) {
                        return wrapped;
                    }
                    return null;
                });
            }
            return new TokenStreamComponents(source, stream);
        }

        @Override
        protected TokenStreamComponents createComponents(final String s) {
            return newComponents();
        }
    }
}
Expand Up @@ -19,18 +19,54 @@
package com.stratio.cassandra.lucene.schema;

import com.stratio.cassandra.lucene.IndexException;
import com.stratio.cassandra.lucene.schema.analysis.ComplexAnalyzerBuilder;
import com.stratio.cassandra.lucene.schema.analysis.SnowballAnalyzerBuilder.SnowballAnalyzer;
import com.stratio.cassandra.lucene.schema.analysis.StandardAnalyzers;
import com.stratio.cassandra.lucene.schema.mapping.*;
import com.stratio.cassandra.lucene.schema.mapping.BigDecimalMapper;
import com.stratio.cassandra.lucene.schema.mapping.BigIntegerMapper;
import com.stratio.cassandra.lucene.schema.mapping.BitemporalMapper;
import com.stratio.cassandra.lucene.schema.mapping.BlobMapper;
import com.stratio.cassandra.lucene.schema.mapping.BooleanMapper;
import com.stratio.cassandra.lucene.schema.mapping.DateMapper;
import com.stratio.cassandra.lucene.schema.mapping.DateRangeMapper;
import com.stratio.cassandra.lucene.schema.mapping.DoubleMapper;
import com.stratio.cassandra.lucene.schema.mapping.FloatMapper;
import com.stratio.cassandra.lucene.schema.mapping.GeoPointMapper;
import com.stratio.cassandra.lucene.schema.mapping.InetMapper;
import com.stratio.cassandra.lucene.schema.mapping.IntegerMapper;
import com.stratio.cassandra.lucene.schema.mapping.LongMapper;
import com.stratio.cassandra.lucene.schema.mapping.Mapper;
import com.stratio.cassandra.lucene.schema.mapping.StringMapper;
import com.stratio.cassandra.lucene.schema.mapping.TextMapper;
import com.stratio.cassandra.lucene.schema.mapping.UUIDMapper;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.es.SpanishAnalyzer;
import org.junit.Test;

import java.io.IOException;

import static com.stratio.cassandra.lucene.schema.SchemaBuilders.*;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.bigDecimalMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.bigIntegerMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.bitemporalMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.blobMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.booleanMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.classpathAnalyzer;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.dateMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.dateRangeMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.doubleMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.floatMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.geoPointMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.inetMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.integerMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.longMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.schema;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.snowballAnalyzer;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.stringMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.textMapper;
import static com.stratio.cassandra.lucene.schema.SchemaBuilders.uuidMapper;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

/**
Expand Down Expand Up @@ -296,6 +332,36 @@ public void testFromJSONWithEmptyAnalyzers() throws IOException {
schema.close();
}

@Test
public void testFromJSONWithComplexAnalyzer() throws IOException {
    // A "complex" analyzer made of an n-gram tokenizer wrapped by stop, lower case and standard filters.
    // The null parameters are placeholders for the wrapped TokenStream constructor argument.
    final String complexJson = "{" +
                               "type:\"complex\"," +
                               "tokenizer:{\"class\":\"ngram\", \"parameters\":[\"1\",\"2\"]}," +
                               "token_streams:[" +
                               " {" +
                               "\"class\":\"stop\"," +
                               "\"parameters\":[null, \"a,an,and,are,as,at,be,but,by,for,if,in,into,is,it,no,not,of,on," +
                               "or,such,that,the,their,then,there,these,they,this,to,was,will,with\"]" +
                               " }," +
                               " {" +
                               "\"class\":\"org.apache.lucene.analysis.core.LowerCaseFilter\"," +
                               "\"parameters\":[null]" +
                               " }," +
                               " {" +
                               "\"class\":\"org.apache.lucene.analysis.standard.StandardFilter\"," +
                               "\"parameters\":[null]" +
                               " }" +
                               "]}";
    // No trailing characters after the closing brace: the document must be well-formed on its own.
    final String json = "{analyzers:{\"customandcomplex\":" + complexJson + "}, default_analyzer : \"customandcomplex\" }";
    final Schema schema = SchemaBuilder.fromJson(json).build();
    try {
        final Analyzer defaultAnalyzer = schema.getDefaultAnalyzer();
        assertNotNull(defaultAnalyzer);
        assertTrue(defaultAnalyzer instanceof ComplexAnalyzerBuilder.ComplexAnalyzer);
    } finally {
        // Release the schema even when an assertion above fails.
        schema.close();
    }
}

@Test
public void testParseJSONWithNullDefaultAnalyzer() throws IOException {

Expand Down