Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use PCRE for builtins.match and builtins.split #7336

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions configure.ac
Expand Up @@ -279,6 +279,7 @@ PKG_CHECK_MODULES([GTEST], [gtest_main])
# Look for nlohmann/json.
PKG_CHECK_MODULES([NLOHMANN_JSON], [nlohmann_json >= 3.9])

PKG_CHECK_MODULES([PCRE2], [libpcre2-8 >= 10.39])

# documentation generation switch
AC_ARG_ENABLE(doc-gen, AS_HELP_STRING([--disable-doc-gen],[disable documentation generation]),
Expand Down
1 change: 1 addition & 0 deletions flake.nix
Expand Up @@ -115,6 +115,7 @@
boost
lowdown-nix
gtest
pcre2
]
++ lib.optionals stdenv.isLinux [libseccomp]
++ lib.optional (stdenv.isLinux || stdenv.isDarwin) libsodium
Expand Down
1 change: 1 addition & 0 deletions src/libexpr/eval.cc
Expand Up @@ -2478,6 +2478,7 @@ void EvalState::printStats()
sizes.attr("Bindings", sizeof(Bindings));
sizes.attr("Attr", sizeof(Attr));
}
topObj.attr("regexCache", regexCacheSize(regexCache));
topObj.attr("nrOpUpdates", nrOpUpdates);
topObj.attr("nrOpUpdateValuesCopied", nrOpUpdateValuesCopied);
topObj.attr("nrThunks", nrThunks);
Expand Down
1 change: 1 addition & 0 deletions src/libexpr/eval.hh
Expand Up @@ -76,6 +76,7 @@ void initGC();
struct RegexCache;

std::shared_ptr<RegexCache> makeRegexCache();
size_t regexCacheSize(std::shared_ptr<RegexCache> cache);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be replaced by a RegexCache::size() method?


struct DebugTrace {
std::optional<ErrPos> pos;
Expand Down
2 changes: 1 addition & 1 deletion src/libexpr/local.mk
Expand Up @@ -15,7 +15,7 @@ libexpr_CXXFLAGS += -I src/libutil -I src/libstore -I src/libfetchers -I src/lib

libexpr_LIBS = libutil libstore libfetchers

libexpr_LDFLAGS += -lboost_context -pthread
libexpr_LDFLAGS += -lboost_context -pthread -lpcre2-8
ifdef HOST_LINUX
libexpr_LDFLAGS += -ldl
endif
Expand Down
205 changes: 0 additions & 205 deletions src/libexpr/primops.cc
Expand Up @@ -21,12 +21,10 @@

#include <algorithm>
#include <cstring>
#include <regex>
#include <dlfcn.h>

#include <cmath>


namespace nix {


Expand Down Expand Up @@ -3497,209 +3495,6 @@ static RegisterPrimOp primop_hashString({
.fun = prim_hashString,
});

/* Cache of compiled POSIX extended regular expressions, keyed by their
   source text.

   `cache` is keyed by string views that point into the strings owned by
   `keys`. `keys` is a std::list so that appending further entries never
   relocates the strings those views refer to. */
struct RegexCache
{
    // TODO use C++20 transparent comparison when available
    std::unordered_map<std::string_view, std::regex> cache;
    std::list<std::string> keys;

    /* Return the compiled regex for `re`, compiling and caching it on
       first use.

       Throws std::regex_error if `re` cannot be compiled. In that case
       the cache is left exactly as it was before the call. */
    std::regex get(std::string_view re)
    {
        auto it = cache.find(re);
        if (it != cache.end())
            return it->second;

        // The key string has to exist before the map entry, because the
        // map's key is a view into it.
        keys.emplace_back(re);
        try {
            return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second;
        } catch (...) {
            // Compilation or insertion failed, so nothing refers to the
            // key we just appended. Drop it, otherwise every failed
            // lookup of an invalid pattern would grow `keys`.
            keys.pop_back();
            throw;
        }
    }
};

/* Create a new, default-constructed RegexCache under shared ownership. */
std::shared_ptr<RegexCache> makeRegexCache()
{
    auto freshCache = std::make_shared<RegexCache>();
    return freshCache;
}

/* Implementation of the `__match` primop.

   args[0] is the regular expression and args[1] the string to test.
   When the whole string matches, `v` becomes a list with one element per
   capture group: a string for a group that participated in the match,
   null for one that did not. When it does not match, `v` becomes null.

   A std::regex_error raised while compiling or matching is reported as
   an EvalError at `pos`. */
void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
{
    auto pattern = state.forceStringNoCtx(*args[0], pos);

    try {

        auto compiled = state.regexCache->get(pattern);

        // The context collected here is not used in the result.
        PathSet ctx;
        const auto subject = state.forceString(*args[1], ctx, pos);

        std::cmatch groups;
        const bool wholeStringMatches =
            std::regex_match(subject.begin(), subject.end(), groups, compiled);
        if (!wholeStringMatches) {
            v.mkNull();
            return;
        }

        // Sub-match 0 is the whole string, so only groups 1..n are returned.
        const size_t nrGroups = groups.size() - 1;
        state.mkList(v, nrGroups);
        for (size_t g = 1; g <= nrGroups; ++g) {
            auto slot = state.allocValue();
            v.listElems()[g - 1] = slot;
            if (groups[g].matched)
                slot->mkString(groups[g].str());
            else
                slot->mkNull();
        }

    } catch (std::regex_error & err) {
        if (err.code() != std::regex_constants::error_space) {
            state.debugThrowLastTrace(EvalError({
                .msg = hintfmt("invalid regular expression '%s'", pattern),
                .errPos = state.positions[pos]
            }));
        } else {
            // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
            state.debugThrowLastTrace(EvalError({
                .msg = hintfmt("memory limit exceeded by regular expression '%s'", pattern),
                .errPos = state.positions[pos]
            }));
        }
    }
}

/* Registration record for the `__match` primop: it takes the arguments
   `regex` and `str` and is implemented by prim_match. The `.doc` text
   describes the builtin's behaviour together with worked examples. */
static RegisterPrimOp primop_match({
.name = "__match",
.args = {"regex", "str"},
.doc = R"s(
Returns a list if the [extended POSIX regular
expression](http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04)
*regex* matches *str* precisely, otherwise returns `null`. Each item
in the list is a regex group.

```nix
builtins.match "ab" "abc"
```

Evaluates to `null`.

```nix
builtins.match "abc" "abc"
```

Evaluates to `[ ]`.

```nix
builtins.match "a(b)(c)" "abc"
```

Evaluates to `[ "b" "c" ]`.

```nix
builtins.match "[[:space:]]+([[:upper:]]+)[[:space:]]+" " FOO "
```

Evaluates to `[ "FOO" ]`.
)s",
.fun = prim_match,
});

/* Split a string with a regular expression, and return a list of the
   non-matching parts interleaved by the lists of the matching groups.

   args[0] is the regular expression and args[1] the string to split.
   For N matches the result has 2 * N + 1 elements: the text before each
   match, then a list of that match's capture groups (null for a group
   that did not participate), and finally the text after the last match.
   With no match at all the result is a one-element list holding args[1]
   itself.

   A std::regex_error raised while compiling or matching is reported as
   an EvalError at `pos`. */
void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
{
    auto re = state.forceStringNoCtx(*args[0], pos);

    try {

        auto regex = state.regexCache->get(re);

        // The context collected here is not used in the result.
        PathSet context;
        const auto str = state.forceString(*args[1], context, pos);

        auto begin = std::cregex_iterator(str.begin(), str.end(), regex);
        auto end = std::cregex_iterator();

        // Any matches results are surrounded by non-matching results.
        // Counting the matches up front walks the iterator once; the loop
        // below walks a copy of it a second time.
        const size_t len = std::distance(begin, end);
        state.mkList(v, 2 * len + 1);
        size_t idx = 0;

        if (len == 0) {
            v.listElems()[idx++] = args[1];
            return;
        }

        for (auto i = begin; i != end; ++i) {
            assert(idx <= 2 * len + 1 - 3);
            // Bind by reference: the iterator owns the match result and it
            // stays valid until the iterator is advanced, so there is no
            // need to deep-copy it for every match.
            const auto & match = *i;

            // Add a string for non-matched characters.
            (v.listElems()[idx++] = state.allocValue())->mkString(match.prefix().str());

            // Add a list for matched substrings.
            const size_t slen = match.size() - 1;
            auto elem = v.listElems()[idx++] = state.allocValue();

            // Start at 1, because the first match is the whole string.
            state.mkList(*elem, slen);
            for (size_t si = 0; si < slen; ++si) {
                if (!match[si + 1].matched)
                    (elem->listElems()[si] = state.allocValue())->mkNull();
                else
                    (elem->listElems()[si] = state.allocValue())->mkString(match[si + 1].str());
            }

            // Add a string for non-matched suffix characters.
            if (idx == 2 * len)
                (v.listElems()[idx++] = state.allocValue())->mkString(match.suffix().str());
        }

        assert(idx == 2 * len + 1);

    } catch (const std::regex_error & e) {
        if (e.code() == std::regex_constants::error_space) {
            // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
            state.debugThrowLastTrace(EvalError({
                .msg = hintfmt("memory limit exceeded by regular expression '%s'", re),
                .errPos = state.positions[pos]
            }));
        } else
            state.debugThrowLastTrace(EvalError({
                .msg = hintfmt("invalid regular expression '%s'", re),
                .errPos = state.positions[pos]
            }));
    }
}

/* Registration record for the `__split` primop: it takes the arguments
   `regex` and `str` and is implemented by prim_split. The `.doc` text
   describes the builtin's behaviour together with worked examples. */
static RegisterPrimOp primop_split({
.name = "__split",
.args = {"regex", "str"},
.doc = R"s(
Returns a list composed of non matched strings interleaved with the
lists of the [extended POSIX regular
expression](http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04)
*regex* matches of *str*. Each item in the lists of matched
sequences is a regex group.

```nix
builtins.split "(a)b" "abc"
```

Evaluates to `[ "" [ "a" ] "c" ]`.

```nix
builtins.split "([ac])" "abc"
```

Evaluates to `[ "" [ "a" ] "b" [ "c" ] "" ]`.

```nix
builtins.split "(a)|(c)" "abc"
```

Evaluates to `[ "" [ "a" null ] "b" [ null "c" ] "" ]`.

```nix
builtins.split "([[:upper:]]+)" " FOO "
```

Evaluates to `[ " " [ "FOO" ] " " ]`.
)s",
.fun = prim_split,
});

static void prim_concatStringsSep(EvalState & state, const PosIdx pos, Value * * args, Value & v)
{
Expand Down