Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Allow exceeded term threshold to execute #2358

Draft
wants to merge 2 commits into
base: integration
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -77,7 +77,7 @@
*/
public class ShardQueryConfiguration extends GenericQueryConfiguration implements Serializable, CheckpointableQueryConfiguration {

public static final String PARAM_VALUE_SEP_STR = new String(new char[] {Constants.PARAM_VALUE_SEP});
public static final String PARAM_VALUE_SEP_STR = String.valueOf(Constants.PARAM_VALUE_SEP);
public static final String TABLE_NAME_SOURCE = "tableName";
public static final String QUERY_LOGIC_NAME_SOURCE = "queryLogic";

Expand Down Expand Up @@ -857,7 +857,7 @@ public static ShardQueryConfiguration create(ShardQueryLogic shardQueryLogic, Qu

/**
* A convenience method that determines whether we can handle when we have exceeded the value threshold on some node. We can handle this if the Ivarators
* can be used which required a hadoop config and a base hdfs cache directory.
* can be used which require a hadoop config and a base hdfs cache directory.
*
* @return if we can handle the exceeded value
*/
Expand All @@ -866,12 +866,13 @@ public boolean canHandleExceededValueThreshold() {
}

/**
* A convenience method that determines whether we can handle when we have exceeded the term threshold on some node. Currently we cannot.
* A convenience method that determines whether we can handle when we have exceeded the term threshold on some node. We can handle this if the Ivarators can
* be used which require a hadoop config and a base hdfs cache directory.
*
* @return if we can handle exceeding the term threshold
*/
public boolean canHandleExceededTermThreshold() {
return false;
return this.hdfsSiteConfigURLs != null && (null != this.ivaratorCacheDirConfigs && !this.ivaratorCacheDirConfigs.isEmpty());
}

public String getShardTableName() {
Expand Down
Expand Up @@ -19,7 +19,7 @@
*
*/
public class IndexRegexIteratorBuilder extends IvaratorBuilder implements IteratorBuilder {
private static Logger log = Logger.getLogger(IndexRegexIteratorBuilder.class);
private static final Logger log = Logger.getLogger(IndexRegexIteratorBuilder.class);

protected Boolean negated;

Expand Down Expand Up @@ -107,9 +107,7 @@ public NestedIterator<Key> build() {
timeFilter = null;
datatypeFilter = null;
keyTform = null;
timeFilter = null;
ivaratorCacheDirs = null;
field = null;
node = null;
return itr;
} else {
Expand Down
Expand Up @@ -967,12 +967,15 @@ public Object visit(ASTOrNode node, Object data) {
public Object visit(ASTAndNode node, Object data) {
STATE state;
QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(node);
// until we implement an ivarator that can handle an ExceededTermThreshold node, and ensure that the JexlContext gets
// _ANYFIELD_ values, then we cannot execute these nodes
// if exceeded term threshold and index only, ivarate - otherwise set as non-executable
if (instance.isType(EXCEEDED_TERM)) {
state = STATE.NON_EXECUTABLE;
if (output != null) {
output.writeLine(data, node, "( Exceeded Term Threshold )", state, true);
if (isIndexOnly(node)) {
state = STATE.EXECUTABLE;
} else {
state = STATE.NON_EXECUTABLE;
if (output != null) {
output.writeLine(data, node, "( Exceeded Term Threshold )", state, true);
}
}
}
// if an ivarator then return true, else check out children
Expand Down
Expand Up @@ -4,6 +4,7 @@
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.DROPPED;
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EVALUATION_ONLY;
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EXCEEDED_OR;
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EXCEEDED_TERM;
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EXCEEDED_VALUE;
import static org.apache.commons.jexl3.parser.JexlNodes.setChildren;

Expand Down Expand Up @@ -273,12 +274,41 @@ public Object visit(ASTAndNode and, Object data) {
// index checking has already been done, otherwise we would not have
// an "ExceededValueThresholdMarker"
// hence the "IndexAgnostic" method can be used here
LiteralRange range = JexlASTHelper.findRange().recursively().getRange(and);
LiteralRange<?> range = JexlASTHelper.findRange().recursively().getRange(and);
if (range == null) {
QueryException qe = new QueryException(DatawaveErrorCode.MULTIPLE_RANGES_IN_EXPRESSION);
throw new DatawaveFatalQueryException(qe);
}
((IndexRangeIteratorBuilder) data).setRange(range);
} else if (instance.isType(EXCEEDED_TERM)) {
// since an ExceededTermThresholdMarker can stand in place of a regex OR a range we have to cover both cases
JexlNode source = instance.getSource();
if (source instanceof ASTAndNode) { // the case for a range
try {
// @formatter:off
List<ASTFunctionNode> functionNodes = JexlASTHelper
.getFunctionNodes(source)
.stream()
.filter(node -> JexlFunctionArgumentDescriptorFactory.F.getArgumentDescriptor(node).allowIvaratorFiltering())
.collect(Collectors.toList());
// @formatter:on
if (functionNodes.isEmpty()) {
ivarateRange(and, source, data);
}
} catch (IOException ioe) {
throw new DatawaveFatalQueryException("Unable to ivarate", ioe);
}
} else if (source instanceof ASTERNode || source instanceof ASTNRNode) { // the case for a regex
try {
ivarateRegex(and, source, data);
} catch (IOException ioe) {
throw new DatawaveFatalQueryException("Unable to ivarate", ioe);
}
} else {
QueryException qe = new QueryException(DatawaveErrorCode.UNEXPECTED_SOURCE_NODE,
MessageFormat.format("{0}", "Limited ExceededTermThresholdMarkerJexlNode"));
throw new DatawaveFatalQueryException(qe);
}
} else if (instance.isType(EXCEEDED_VALUE)) {
// if the parent is our ExceededValueThreshold marker, then use an
// Ivarator to get the job done unless we don't have to
Expand Down Expand Up @@ -337,7 +367,7 @@ public Object visit(ASTAndNode and, Object data) {
if (termFrequencyFields.contains(identifier)) {
nested = buildExceededFromTermFrequency(identifier, and, source, range, data);
} else {
/**
/*
* This is okay since 1) We are doc specific 2) We are not index only or tf 3) Therefore, we must evaluate against the document for this
* expression 4) Return a stubbed range in case we have a disjunction that breaks the current doc.
*/
Expand Down Expand Up @@ -653,7 +683,7 @@ public Object visit(ASTEQNode node, Object data) {
throw new RuntimeException(e);
}

/**
/*
* If we have an unindexed type enforced, we've been configured to assert whether the field is indexed.
*/
if (isUnindexed(node)) {
Expand Down Expand Up @@ -923,7 +953,7 @@ protected Object visitDelayedIndexOnly(ASTEQNode node, Object data) {
throw new RuntimeException(e);
}

/**
/*
* If we have an unindexed type enforced, we've been configured to assert whether the field is indexed.
*/
if (isUnindexed(node)) {
Expand Down Expand Up @@ -1187,7 +1217,7 @@ public LiteralRange<?> buildLiteralRange(JexlNode source, Object data) {
// "ExceededValueThresholdMarker"
// hence the "IndexAgnostic" method can be used here
if (source instanceof ASTAndNode) {
LiteralRange range = JexlASTHelper.findRange().recursively().getRange(source);
LiteralRange<?> range = JexlASTHelper.findRange().recursively().getRange(source);
if (range == null) {
QueryException qe = new QueryException(DatawaveErrorCode.MULTIPLE_RANGES_IN_EXPRESSION);
throw new DatawaveFatalQueryException(qe);
Expand Down
Expand Up @@ -2,6 +2,7 @@

import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EVALUATION_ONLY;
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EXCEEDED_OR;
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EXCEEDED_TERM;
import static datawave.query.jexl.nodes.QueryPropertyMarker.MarkerType.EXCEEDED_VALUE;

import org.apache.commons.jexl3.parser.ASTAndNode;
Expand All @@ -10,10 +11,10 @@
import datawave.query.jexl.nodes.QueryPropertyMarker;

/**
* A visitor that checks the query tree to determine if the query requires an ivarator (ExceededValue or ExceededOr)
* A visitor that checks the query tree to determine if the query requires an ivarator (ExceededValue, ExceededTerm, or ExceededOr)
*
*/
public class IvaratorRequiredVisitor extends BaseVisitor {
public class IvaratorRequiredVisitor extends ShortCircuitBaseVisitor {

private boolean ivaratorRequired = false;

Expand All @@ -32,7 +33,7 @@ public Object visit(ASTAndNode and, Object data) {
QueryPropertyMarker.Instance instance = QueryPropertyMarker.findInstance(and);
if (instance.isType(EVALUATION_ONLY)) {
return data;
} else if (instance.isAnyTypeOf(EXCEEDED_OR, EXCEEDED_VALUE)) {
} else if (instance.isAnyTypeOf(EXCEEDED_OR, EXCEEDED_VALUE, EXCEEDED_TERM)) {
ivaratorRequired = true;
} else if (!instance.isAnyTypeOf()) {
super.visit(and, data);
Expand Down
Expand Up @@ -1733,7 +1733,7 @@ private void loadDataTypeMetadata(Multimap<String,Type<?>> fieldToDatatypeMap, S
}

protected Set<String> upcase(Set<String> fields) {
return fields.stream().map(s -> s.toUpperCase()).collect(Collectors.toSet());
return fields.stream().map(String::toUpperCase).collect(Collectors.toSet());
}

protected ASTJexlScript upperCaseIdentifiers(MetadataHelper metadataHelper, ShardQueryConfiguration config, ASTJexlScript script) {
Expand All @@ -1758,12 +1758,12 @@ protected ASTJexlScript upperCaseIdentifiers(MetadataHelper metadataHelper, Shar

UniqueFields uniqueFields = config.getUniqueFields();
if (uniqueFields != null && !uniqueFields.isEmpty()) {
Sets.newHashSet(uniqueFields.getFields()).stream().forEach(s -> uniqueFields.replace(s, s.toUpperCase()));
Sets.newHashSet(uniqueFields.getFields()).forEach(s -> uniqueFields.replace(s, s.toUpperCase()));
}

ExcerptFields excerptFields = config.getExcerptFields();
if (excerptFields != null && !excerptFields.isEmpty()) {
Sets.newHashSet(excerptFields.getFields()).stream().forEach(s -> excerptFields.replace(s, s.toUpperCase()));
Sets.newHashSet(excerptFields.getFields()).forEach(s -> excerptFields.replace(s, s.toUpperCase()));
}

Set<String> userProjection = config.getRenameFields();
Expand Down Expand Up @@ -2771,9 +2771,15 @@ public Tuple2<CloseableIterable<QueryPlan>,Boolean> getQueryRanges(ScannerFactor
}

// check for the case where we cannot handle an ivarator but the query requires an ivarator
if (IvaratorRequiredVisitor.isIvaratorRequired(queryTree) && !config.canHandleExceededValueThreshold()) {
log.debug("Needs full table scan because we exceeded the value threshold and config.canHandleExceededValueThreshold() is false");
needsFullTable = true;
if (IvaratorRequiredVisitor.isIvaratorRequired(queryTree)) {
if (!config.canHandleExceededValueThreshold()) {
log.debug("Needs full table scan because we exceeded the value threshold and config.canHandleExceededValueThreshold() is false");
needsFullTable = true;
}
if (!config.canHandleExceededTermThreshold()) {
log.debug("Needs full table scan because we exceeded the term threshold and config.canHandleExceededTermThreshold() is false");
needsFullTable = true;
}
}

stopwatch.stop();
Expand Down Expand Up @@ -2890,7 +2896,7 @@ public static <T> CloseableIterable<T> emptyCloseableIterator() {

@Override
public Iterator<T> iterator() {
return Collections.<T> emptyList().iterator();
return Collections.emptyIterator();
}

@Override
Expand Down
Expand Up @@ -45,7 +45,7 @@ public void setup() {
public void testNegationErrorCheck() throws ParseException, TableNotFoundException {
ASTJexlScript query = JexlASTHelper.parseJexlQuery("FOO == 'bar' && !(INDEXONLYFIELD == null)");

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");
HashSet<String> indexOnlyFields = new HashSet<>();
Expand Down Expand Up @@ -84,7 +84,7 @@ public void testNegationErrorCheck() throws ParseException, TableNotFoundExcepti
public void testIndexOnlyEqNull() throws ParseException, TableNotFoundException {
ASTJexlScript query = JexlASTHelper.parseJexlQuery("INDEXONLYFIELD == null");

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");

HashSet<String> indexOnlyFields = new HashSet<>();
Expand Down Expand Up @@ -125,7 +125,7 @@ public void testIndexOnlyNeNull() throws ParseException, TableNotFoundException

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");

HashSet<String> indexOnlyFields = new HashSet<>();
Expand Down Expand Up @@ -164,7 +164,7 @@ public void testIndexedEqNull() throws ParseException, TableNotFoundException {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -204,7 +204,7 @@ public void testIndexedNeNull() throws ParseException, TableNotFoundException {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -251,7 +251,7 @@ public void testNegatedAndExecutable() throws Exception {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -298,7 +298,7 @@ public void testDoubleNegatedAndExecutable() throws Exception {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -345,7 +345,7 @@ public void testNegatedAndNotExecutable() throws Exception {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -392,7 +392,7 @@ public void testDoubleNegatedAndNotExecutable() throws Exception {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -439,7 +439,7 @@ public void testNegatedOrExecutable() throws Exception {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -488,7 +488,7 @@ public void testDoubleNegatedOrExecutable() throws Exception {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -571,7 +571,7 @@ public void testAllOrSomeEmpty() throws TableNotFoundException {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -605,7 +605,7 @@ public void testAllOrSomeEmptyPortion() throws TableNotFoundException {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -637,7 +637,7 @@ public void testAllOrNoneEmpty() throws TableNotFoundException {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down Expand Up @@ -671,7 +671,7 @@ public void testAllOrNoneEmptyPortion() throws TableNotFoundException {

LinkedList<String> output = new LinkedList<>();

HashSet indexedFields = new HashSet();
HashSet<String> indexedFields = new HashSet<>();
indexedFields.add("INDEXONLYFIELD");
indexedFields.add("INDEXEDFIELD");

Expand Down