Skip to content

Commit

Permalink
Make leading '$.' optional in JSON paths accepted by json_extract[_sc…
Browse files Browse the repository at this point in the history
…alar] Presto functions

Summary:
json_extract_scalar and json_extract Presto function allow paths like 'foo'
and 'foo.bar'. These paths are non-standard. They are enabled via use of Jayway
engine, which handles paths without leading '$' by prepending the path
with '$.'. Path 'foo' becomes '$.foo'. However, this logic has some unexpected
side effects. Path '.foo' becomes '$..foo', which uses deep scan operator '..'
and matches 'foo' keys anywhere in the document, rather then just at the top
level.

This change allows Velox to support paths like 'foo' and 'foo.bar[2].bar', but
not '.foo' or '[10]'. These paths are handled as if they contained leading '$.' characters.

See prestodb/presto#22589

Part of facebookincubator#7049

Differential Revision: D56465662
  • Loading branch information
mbasmanova authored and facebook-github-bot committed Apr 23, 2024
1 parent fb9355d commit 7ad90be
Show file tree
Hide file tree
Showing 5 changed files with 197 additions and 87 deletions.
36 changes: 34 additions & 2 deletions velox/functions/prestosql/json/JsonPathTokenizer.cpp
Expand Up @@ -41,11 +41,35 @@ bool isDotKeyFormat(char c) {
} // namespace

bool JsonPathTokenizer::reset(std::string_view path) {
if (path.empty() || path[0] != ROOT) {
if (path.empty()) {
index_ = 0;
path_ = {};
return false;
}

if (path[0] == ROOT) {
index_ = 1;
path_ = path;
return true;
}

// Presto supplements basic JsonPath implementation with Jayway engine,
// which allows paths that do not start with '$'. Jayway simply prepends
// such paths with '$.'. This logic supports paths like 'foo' by converting
// them to '$.foo'. However, this logic converts paths like '.foo' into
// '$..foo' which uses deep scan operator (..). This changes the meaning of
// the path in unexpected way.
//
// Here, we allow paths like 'foo', 'foo.bar' and similar. We do not allow
// paths like '.foo' or '[0]'.

index_ = 0;
if (path[0] == DOT || path[0] == OPEN_BRACKET) {
path_ = {};
return false;
}
index_ = 1;
path_ = path;

return true;
}

Expand All @@ -54,9 +78,16 @@ bool JsonPathTokenizer::hasNext() const {
}

std::optional<std::string> JsonPathTokenizer::getNext() {
if (index_ == 0) {
// We are parsing first token in a path that doesn't start with '$'. In
// this case, we assume the path starts with '$.'.
return matchDotKey();
}

if (match(DOT)) {
return matchDotKey();
}

if (match(OPEN_BRACKET)) {
auto token =
match(QUOTE) ? matchQuotedSubscriptKey() : matchUnquotedSubscriptKey();
Expand All @@ -65,6 +96,7 @@ std::optional<std::string> JsonPathTokenizer::getNext() {
}
return token;
}

return std::nullopt;
}

Expand Down
34 changes: 33 additions & 1 deletion velox/functions/prestosql/json/JsonPathTokenizer.h
Expand Up @@ -21,12 +21,40 @@

namespace facebook::velox::functions {

/// Splits a JSON path into individual tokens.
///
/// Supports a subset of JSONPath syntax:
/// https://goessner.net/articles/JsonPath/
///
/// $ - the root element
/// . or [] - child operator
/// * - wildcard (all objects/elements regardless their names)
///
/// Supports quoted keys.
///
/// Notably, doesn't support deep scan, e.g. $..author.
///
/// The leading '$.' is optional. Paths '$.foo.bar' and 'foo.bar' are
/// equivalent. This is not part of JSONPath syntax, but this is the behavior of
/// Jayway implementation used by Presto.
///
/// Examples:
/// "$.store.book[0].author"
/// "store.book[0].author"
class JsonPathTokenizer {
public:
/// Resets the tokenizer to a new path. This method must be called and return
/// true before calling hasNext or getNext.
///
/// @return true if path is not empty and first character suggests a possibly
/// valid path.
bool reset(std::string_view path);

/// Returns true if there are more tokens to be extracted.
bool hasNext() const;

/// Returns the next token or std::nullopt if there are no more tokens left or
/// the remaining path is invalid.
std::optional<std::string> getNext();

private:
Expand All @@ -38,8 +66,12 @@ class JsonPathTokenizer {

std::optional<std::string> matchQuotedSubscriptKey();

private:
// The index of the next character to process. This is at least one for
// standard paths that start with '$'. This can be zero if 'reset' was called
// with a non-standard path that doesn't have a leading '$'.
size_t index_;

// The path specified in 'reset'.
std::string_view path_;
};

Expand Down
170 changes: 93 additions & 77 deletions velox/functions/prestosql/json/tests/JsonPathTokenizerTest.cpp
Expand Up @@ -17,41 +17,15 @@
#include "velox/functions/prestosql/json/JsonPathTokenizer.h"

#include <vector>

#include "gtest/gtest.h"

using namespace std::string_literals;
using facebook::velox::functions::JsonPathTokenizer;
using TokenList = std::vector<std::string>;

#define EXPECT_TOKEN_EQ(path, expected) \
{ \
auto jsonPath = path; \
auto tokens = getTokens(jsonPath); \
EXPECT_TRUE(bool(tokens)); \
EXPECT_EQ(expected, tokens.value()); \
}

#define EXPECT_TOKEN_INVALID(path) \
{ \
auto tokens = getTokens(path); \
EXPECT_FALSE(bool(tokens)); \
}
namespace facebook::velox::functions {
namespace {

#define EXPECT_QUOTED_TOKEN_EQ(path, expected) \
{ \
auto quotedPath = "$[\"" + path + "\"]"; \
EXPECT_TOKEN_EQ(quotedPath, expected); \
}

#define EXPECT_UNQUOTED_TOKEN_INVALID(path) \
{ \
auto invalidPath = "$." + path; \
EXPECT_TOKEN_INVALID(invalidPath); \
}

// The test is ported from Presto for compatibility
std::optional<TokenList> getTokens(const std::string& path) {
std::optional<TokenList> getTokens(std::string_view path) {
JsonPathTokenizer tokenizer;
if (!tokenizer.reset(path)) {
return std::nullopt;
Expand All @@ -69,58 +43,100 @@ std::optional<TokenList> getTokens(const std::string& path) {
return tokens;
}

TEST(JsonPathTokenizerTest, tokenizeTest) {
EXPECT_TOKEN_EQ("$"s, TokenList());
EXPECT_TOKEN_EQ("$.foo"s, TokenList{"foo"s});
EXPECT_TOKEN_EQ("$[\"foo\"]"s, TokenList{"foo"s});
EXPECT_TOKEN_EQ("$[\"foo.bar\"]"s, TokenList{"foo.bar"s});
EXPECT_TOKEN_EQ("$[42]"s, TokenList{"42"s});
EXPECT_TOKEN_EQ("$.42"s, TokenList{"42"s});
EXPECT_TOKEN_EQ("$.42.63"s, (TokenList{"42"s, "63"s}));
EXPECT_TOKEN_EQ(
"$.foo.42.bar.63"s, (TokenList{"foo"s, "42"s, "bar"s, "63"s}));
EXPECT_TOKEN_EQ("$.x.foo"s, (TokenList{"x"s, "foo"s}));
EXPECT_TOKEN_EQ("$.x[\"foo\"]"s, (TokenList{"x"s, "foo"s}));
EXPECT_TOKEN_EQ("$.x[42]"s, (TokenList{"x"s, "42"s}));
EXPECT_TOKEN_EQ("$.foo_42._bar63"s, (TokenList{"foo_42"s, "_bar63"s}));
EXPECT_TOKEN_EQ("$[foo_42][_bar63]"s, (TokenList{"foo_42"s, "_bar63"s}));
EXPECT_TOKEN_EQ("$.foo:42.:bar63"s, (TokenList{"foo:42"s, ":bar63"s}));
EXPECT_TOKEN_EQ(
"$[\"foo:42\"][\":bar63\"]"s, (TokenList{"foo:42"s, ":bar63"s}));
EXPECT_TOKEN_EQ(
// 'path' should start with '$'.
void assertValidPath(std::string_view path, const TokenList& expectedTokens) {
auto tokens = getTokens(path);

EXPECT_TRUE(tokens.has_value()) << "Invalid JSON path: " << path;
if (!tokens.has_value()) {
return;
}

EXPECT_EQ(expectedTokens, tokens.value());
}

void assertQuotedToken(
const std::string& token,
const TokenList& expectedTokens) {
auto quotedPath = "$[\"" + token + "\"]";
assertValidPath(quotedPath, expectedTokens);

auto invalidPath = "$." + token;
EXPECT_FALSE(getTokens(invalidPath));
}

// The test is ported from Presto for compatibility.
TEST(JsonPathTokenizerTest, validPaths) {
assertValidPath("$"s, TokenList());
assertValidPath("$.foo"s, TokenList{"foo"s});
assertValidPath("$[\"foo\"]"s, TokenList{"foo"s});
assertValidPath("$[\"foo.bar\"]"s, TokenList{"foo.bar"s});
assertValidPath("$[42]"s, TokenList{"42"s});
assertValidPath("$.42"s, TokenList{"42"s});
assertValidPath("$.42.63"s, TokenList{"42"s, "63"s});
assertValidPath("$.foo.42.bar.63"s, TokenList{"foo"s, "42"s, "bar"s, "63"s});
assertValidPath("$.x.foo"s, TokenList{"x"s, "foo"s});
assertValidPath("$.x[\"foo\"]"s, TokenList{"x"s, "foo"s});
assertValidPath("$.x[42]"s, TokenList{"x"s, "42"s});
assertValidPath("$.foo_42._bar63"s, TokenList{"foo_42"s, "_bar63"s});
assertValidPath("$[foo_42][_bar63]"s, TokenList{"foo_42"s, "_bar63"s});
assertValidPath("$.foo:42.:bar63"s, TokenList{"foo:42"s, ":bar63"s});
assertValidPath(
"$[\"foo:42\"][\":bar63\"]"s, TokenList{"foo:42"s, ":bar63"s});
assertValidPath(
"$.store.fruit[*].weight",
(TokenList{"store"s, "fruit"s, "*"s, "weight"s}));
EXPECT_QUOTED_TOKEN_EQ("!@#$%^&*()[]{}/?'"s, TokenList{"!@#$%^&*()[]{}/?'"s});
EXPECT_UNQUOTED_TOKEN_INVALID("!@#$%^&*()[]{}/?'"s);
EXPECT_QUOTED_TOKEN_EQ("ab\u0001c"s, TokenList{"ab\u0001c"s});
EXPECT_UNQUOTED_TOKEN_INVALID("ab\u0001c"s);
EXPECT_QUOTED_TOKEN_EQ("ab\0c"s, TokenList{"ab\0c"s});
EXPECT_UNQUOTED_TOKEN_INVALID("ab\0c"s);
EXPECT_QUOTED_TOKEN_EQ("ab\t\n\rc"s, TokenList{"ab\t\n\rc"s});
EXPECT_UNQUOTED_TOKEN_INVALID("ab\t\n\rc"s);
EXPECT_QUOTED_TOKEN_EQ("."s, TokenList{"."s});
EXPECT_UNQUOTED_TOKEN_INVALID("."s);
EXPECT_QUOTED_TOKEN_EQ("$"s, TokenList{"$"s});
EXPECT_UNQUOTED_TOKEN_INVALID("$"s);
EXPECT_QUOTED_TOKEN_EQ("]"s, TokenList{"]"s});
EXPECT_UNQUOTED_TOKEN_INVALID("]"s);
EXPECT_QUOTED_TOKEN_EQ("["s, TokenList{"["s});
EXPECT_UNQUOTED_TOKEN_INVALID("["s);
EXPECT_QUOTED_TOKEN_EQ("'"s, TokenList{"'"s});
EXPECT_UNQUOTED_TOKEN_INVALID("'"s);
EXPECT_QUOTED_TOKEN_EQ(
TokenList{"store"s, "fruit"s, "*"s, "weight"s});
assertValidPath(
"$.store.book[*].author", TokenList{"store", "book", "*", "author"});
assertValidPath("$.store.*", TokenList({"store", "*"}));

// Paths without leading '$'.
assertValidPath("foo", TokenList({"foo"}));
assertValidPath("foo[12].bar", TokenList({"foo", "12", "bar"}));
assertValidPath("foo.bar.baz", TokenList({"foo", "bar", "baz"}));

assertQuotedToken("!@#$%^&*()[]{}/?'"s, TokenList{"!@#$%^&*()[]{}/?'"s});
assertQuotedToken("ab\u0001c"s, TokenList{"ab\u0001c"s});
assertQuotedToken("ab\0c"s, TokenList{"ab\0c"s});
assertQuotedToken("ab\t\n\rc"s, TokenList{"ab\t\n\rc"s});
assertQuotedToken("."s, TokenList{"."s});
assertQuotedToken("$"s, TokenList{"$"s});
assertQuotedToken("]"s, TokenList{"]"s});
assertQuotedToken("["s, TokenList{"["s});
assertQuotedToken("'"s, TokenList{"'"s});
assertQuotedToken(
"!@#$%^&*(){}[]<>?/|.,`~\r\n\t \0"s,
TokenList{"!@#$%^&*(){}[]<>?/|.,`~\r\n\t \0"s});
EXPECT_UNQUOTED_TOKEN_INVALID("!@#$%^&*(){}[]<>?/|.,`~\r\n\t \0"s);
EXPECT_QUOTED_TOKEN_EQ("a\\\\b\\\""s, TokenList{"a\\b\""s});
EXPECT_UNQUOTED_TOKEN_INVALID("a\\\\b\\\""s);
EXPECT_QUOTED_TOKEN_EQ("ab\\\"cd\\\"ef"s, TokenList{"ab\"cd\"ef"s});
assertQuotedToken("a\\\\b\\\""s, TokenList{"a\\b\""s});
assertQuotedToken("ab\\\"cd\\\"ef"s, TokenList{"ab\"cd\"ef"s});
}

// backslash not followed by valid escape
EXPECT_TOKEN_INVALID("$[\"a\\ \"]"s);
TEST(JsonPathTokenizerTest, invalidPaths) {
JsonPathTokenizer tokenizer;

// Empty path.
EXPECT_FALSE(getTokens(""));

// Backslash not followed by valid escape.
EXPECT_FALSE(getTokens("$[\"a\\ \"]"s));

// colon in subscript must be quoted
EXPECT_TOKEN_INVALID("$[foo:bar]"s);
// Colon in subscript must be quoted.
EXPECT_FALSE(getTokens("$[foo:bar]"s));

EXPECT_TOKEN_INVALID("$.store.book[");
// Open bracket without close bracket.
EXPECT_FALSE(getTokens("$.store.book["));

// Unsupported deep scan operator.
EXPECT_FALSE(getTokens("$..store"));
EXPECT_FALSE(getTokens("..store"));
EXPECT_FALSE(getTokens("$..[3].foo"));
EXPECT_FALSE(getTokens("..[3].foo"));

// Paths without leading '$'.
EXPECT_FALSE(getTokens("[1].foo"));
EXPECT_FALSE(getTokens(".[1].foo"));
EXPECT_FALSE(getTokens(".foo.bar.baz"));
}

} // namespace
} // namespace facebook::velox::functions
35 changes: 28 additions & 7 deletions velox/functions/prestosql/tests/JsonExtractScalarTest.cpp
Expand Up @@ -14,6 +14,7 @@
* limitations under the License.
*/

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"
#include "velox/functions/prestosql/types/JsonType.h"

Expand Down Expand Up @@ -103,6 +104,10 @@ TEST_F(JsonExtractScalarTest, simple) {
jsonExtractScalar(
R"([{"k1":[{"k2": ["v1", "v2"]}]}])", "$[0].k1[0].k2[1]"),
"v2");

// Paths without leading '$'.
EXPECT_EQ(jsonExtractScalar(R"({"k1":"v1"})", "k1"), "v1");
EXPECT_EQ(jsonExtractScalar(R"({"k1":{"k2": 999}})", "k1.k2"), "999");
}

TEST_F(JsonExtractScalarTest, jsonType) {
Expand Down Expand Up @@ -164,13 +169,29 @@ TEST_F(JsonExtractScalarTest, utf8) {
}

TEST_F(JsonExtractScalarTest, invalidPath) {
EXPECT_THROW(jsonExtractScalar(R"([0,1,2])", ""), VeloxUserError);
EXPECT_THROW(jsonExtractScalar(R"([0,1,2])", "$[]"), VeloxUserError);
EXPECT_THROW(jsonExtractScalar(R"([0,1,2])", "$[-1]"), VeloxUserError);
EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1"})", "$k1"), VeloxUserError);
EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1"})", "$.k1."), VeloxUserError);
EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1"})", "$.k1]"), VeloxUserError);
EXPECT_THROW(jsonExtractScalar(R"({"k1":"v1)", "$.k1]"), VeloxUserError);
VELOX_ASSERT_THROW(jsonExtractScalar(R"([0,1,2])", ""), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"([0,1,2])", "$[]"), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"([0,1,2])", "$[-1]"), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"({"k1":"v1"})", "$k1"), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"({"k1":"v1"})", "$.k1."), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"({"k1":"v1"})", "$.k1]"), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"({"k1":"v1)", "$.k1]"), "Invalid JSON path");

// Paths without leading '$'.
VELOX_ASSERT_THROW(jsonExtractScalar(R"([1,2])", "[0]"), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"([1,2])", ".[0]"), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"({"k1":"v1"})", ".k1"), "Invalid JSON path");
VELOX_ASSERT_THROW(
jsonExtractScalar(R"({"k1":{"k2": 999}})", ".k1.k2"),
"Invalid JSON path");
}

// simdjson, like Presto java, returns the large number as-is as a string,
Expand Down
9 changes: 9 additions & 0 deletions velox/functions/prestosql/tests/JsonFunctionsTest.cpp
Expand Up @@ -601,6 +601,15 @@ TEST_F(JsonFunctionsTest, jsonExtract) {
jsonExtract(kJson, "$.store.book[*].isbn"));
EXPECT_EQ("\"Evelyn Waugh\"", jsonExtract(kJson, "$.store.book[1].author"));

// Paths without leading '$'.
auto json = R"({"x": {"a": 1, "b": [10, 11, 12]} })";
EXPECT_EQ(R"({"a": 1, "b": [10, 11, 12]})", jsonExtract(json, "x"));
EXPECT_EQ("1", jsonExtract(json, "x.a"));
EXPECT_EQ("[10, 11, 12]", jsonExtract(json, "x.b"));
EXPECT_EQ("12", jsonExtract(json, "x.b[2]"));
EXPECT_EQ(std::nullopt, jsonExtract(json, "x.c"));
EXPECT_EQ(std::nullopt, jsonExtract(json, "x.b[20]"));

// TODO The following paths are supported by Presto via Jayway, but do not
// work in Velox yet. Figure out how to add support for these.
VELOX_ASSERT_THROW(jsonExtract(kJson, "$..price"), "Invalid JSON path");
Expand Down

0 comments on commit 7ad90be

Please sign in to comment.