Skip to content

Commit

Permalink
A little more regex source generator tweaking (#62605)
Browse files Browse the repository at this point in the history
* Remove now unecessary "Variable declared but never used" warning suppression

* Avoid declaring inputSpan in FindFirstChar if we don't have to

* Add RegexNode.Ref/Bol/Eol to non-backtracking list

* Move some additionalDeclarations back to where they're used

I was overaggressive in moving these to the beginning.  Some are fine where they're needed.

* Delete dead code

* Emit IsEmpty for more slice.Length checks

* Apply suggestions from code review
  • Loading branch information
stephentoub committed Dec 10, 2021
1 parent 77367fa commit 61b5c92
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 77 deletions.
Expand Up @@ -36,7 +36,6 @@ public partial class RegexGenerator
"#nullable enable",
"#pragma warning disable CS0162 // Unreachable code",
"#pragma warning disable CS0164 // Unreferenced label",
"#pragma warning disable CS0168 // Variable declared but never used",
"#pragma warning disable CS0219 // Variable assigned but never used",
"",
};
Expand Down Expand Up @@ -274,13 +273,11 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
bool hasTextInfo = false;

// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
// To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
// delete it if no additional declarations are required, or we replace it with the list of additional declarations
// built up while generating code.
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
// and then insert them at that position once everything else has been output.
var additionalDeclarations = new HashSet<string>();

// Emit locals initialization
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
writer.Flush();
int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length;
Expand Down Expand Up @@ -315,15 +312,17 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
{
case FindNextStartingPositionMode.LeadingPrefix_LeftToRight_CaseSensitive:
Debug.Assert(!string.IsNullOrEmpty(code.FindOptimizations.LeadingCaseSensitivePrefix));
EmitIndexOf_LeftToRight(code.FindOptimizations.LeadingCaseSensitivePrefix);
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitIndexOf(code.FindOptimizations.LeadingCaseSensitivePrefix);
break;

case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.FixedSets_LeftToRight_CaseInsensitive:
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseSensitive:
case FindNextStartingPositionMode.LeadingSet_LeftToRight_CaseInsensitive:
Debug.Assert(code.FindOptimizations.FixedDistanceSets is { Count: > 0 });
EmitFixedSet_LeftToRight();
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
EmitFixedSet();
break;

default:
Expand All @@ -338,7 +337,7 @@ private static void EmitFindFirstChar(IndentedTextWriter writer, RegexMethod rm,
}
writer.WriteLine();

writer.WriteLine("// No match");
writer.WriteLine("// No starting position found");
writer.WriteLine("ReturnFalse:");
writer.WriteLine("base.runtextpos = end;");
writer.WriteLine("return false;");
Expand Down Expand Up @@ -368,8 +367,7 @@ bool EmitAnchors()

case RegexPrefixAnalyzer.Start:
writer.WriteLine("// Start \\G anchor");
additionalDeclarations.Add("int start = base.runtextstart;");
using (EmitBlock(writer, "if (pos > start)"))
using (EmitBlock(writer, "if (pos > base.runtextstart)"))
{
writer.WriteLine("goto ReturnFalse;");
}
Expand Down Expand Up @@ -400,6 +398,7 @@ bool EmitAnchors()
// the other anchors, which all skip all subsequent processing if found, with BOL we just use it
// to boost our position to the next line, and then continue normally with any searches.
writer.WriteLine("// Beginning-of-line anchor");
additionalDeclarations.Add("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
additionalDeclarations.Add("int beginning = base.runtextbeg;");
using (EmitBlock(writer, "if (pos > beginning && inputSpan[pos - 1] != '\\n')"))
{
Expand All @@ -418,8 +417,8 @@ bool EmitAnchors()
return false;
}

// Emits a case-sensitive left-to-right prefix search for a string at the beginning of the pattern.
void EmitIndexOf_LeftToRight(string prefix)
// Emits a case-sensitive prefix search for a string at the beginning of the pattern.
void EmitIndexOf(string prefix)
{
writer.WriteLine($"int i = global::System.MemoryExtensions.IndexOf(inputSpan.Slice(pos, end - pos), {Literal(prefix)});");
writer.WriteLine("if (i >= 0)");
Expand All @@ -429,9 +428,9 @@ void EmitIndexOf_LeftToRight(string prefix)
writer.WriteLine("}");
}

// Emits a left-to-right search for a set at a fixed position from the start of the pattern,
// Emits a search for a set at a fixed position from the start of the pattern,
// and potentially other sets at other fixed positions in the pattern.
void EmitFixedSet_LeftToRight()
void EmitFixedSet()
{
List<(char[]? Chars, string Set, int Distance, bool CaseInsensitive)>? sets = code.FindOptimizations.FixedDistanceSets;
(char[]? Chars, string Set, int Distance, bool CaseInsensitive) primarySet = sets![0];
Expand Down Expand Up @@ -600,15 +599,14 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)

RegexOptions options = (RegexOptions)rm.Options;
RegexCode code = rm.Code;
bool hasTimeout = false;

// Helper to define names. Names start unadorned, but as soon as there's repetition,
// they begin to have a numbered suffix.
var usedNames = new Dictionary<string, int>();

// Every RegexTree is rooted in the implicit Capture for the whole expression.
// Skip the Capture node. We handle the implicit root capture specially.
RegexNode node = rm.Code.Tree.Root;
RegexNode node = code.Tree.Root;
Debug.Assert(node.Type == RegexNode.Capture, "Every generated tree should begin with a capture node");
Debug.Assert(node.ChildCount() == 1, "Capture nodes should have one child");
node = node.Child(0);
Expand All @@ -635,9 +633,8 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
}

// In some cases, we need to emit declarations at the beginning of the method, but we only discover we need them later.
// To handle that, we emit a placeholder value that's not valid C#, and then at the end of the code generation we either
// delete it if no additional declarations are required, or we replace it with the list of additional declarations
// built up while generating code.
// To handle that, we build up a collection of all the declarations to include, track where they should be inserted,
// and then insert them at that position once everything else has been output.
var additionalDeclarations = new HashSet<string>();
var additionalLocalFunctions = new Dictionary<string, string[]>();

Expand All @@ -646,14 +643,11 @@ private static void EmitGo(IndentedTextWriter writer, RegexMethod rm, string id)
writer.WriteLine("global::System.ReadOnlySpan<char> inputSpan = base.runtext;");
writer.WriteLine("int pos = base.runtextpos, end = base.runtextend;");
writer.WriteLine($"int original_pos = pos;");
hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
bool hasTimeout = EmitLoopTimeoutCounterIfNeeded(writer, rm);
bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm);
writer.Flush();
int additionalDeclarationsPosition = ((StringWriter)writer.InnerWriter).GetStringBuilder().Length;
int additionalDeclarationsIndent = writer.Indent;
writer.WriteLine();

// TextInfo textInfo = CultureInfo.CurrentCulture.TextInfo; // only if the whole expression or any subportion is ignoring case, and we're not using invariant
bool hasTextInfo = EmitInitializeCultureForGoIfNecessary(writer, rm);

// The implementation tries to use const indexes into the span wherever possible, which we can do
// for all fixed-length constructs. In such cases (e.g. single chars, repeaters, strings, etc.)
Expand Down Expand Up @@ -936,8 +930,7 @@ void EmitAllBranches()

// Save off pos. We'll need to reset this each time a branch fails.
string startingPos = ReserveName("alternation_starting_pos");
additionalDeclarations.Add($"int {startingPos} = 0;");
writer.WriteLine($"{startingPos} = pos;");
writer.WriteLine($"int {startingPos} = pos;");
int startingSliceStaticPos = sliceStaticPos;

// We need to be able to undo captures in two situations:
Expand All @@ -964,8 +957,7 @@ void EmitAllBranches()
if (expressionHasCaptures && ((node.Options & RegexNode.HasCapturesFlag) != 0 || !isAtomic))
{
startingCapturePos = ReserveName("alternation_starting_capturepos");
additionalDeclarations.Add($"int {startingCapturePos} = 0;");
writer.WriteLine($"{startingCapturePos} = base.Crawlpos();");
writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
}
writer.WriteLine();

Expand Down Expand Up @@ -1211,7 +1203,7 @@ void EmitBackreferenceConditional(RegexNode node)
// to backtrack to. So, we expose a single Backtrack label and track which branch was
// followed in this resumeAt local.
string resumeAt = ReserveName("conditionalbackreference_branch");
additionalDeclarations.Add($"int {resumeAt} = 0;");
writer.WriteLine($"int {resumeAt} = 0;");

// While it would be nicely readable to use an if/else block, if the branches contain
// anything that triggers backtracking, labels will end up being defined, and if they're
Expand Down Expand Up @@ -1340,7 +1332,12 @@ void EmitExpressionConditional(RegexNode node)
{
startingCapturePos = ReserveName("conditionalexpression_starting_capturepos");
writer.WriteLine($"int {startingCapturePos} = base.Crawlpos();");
writer.WriteLine();
}

string resumeAt = ReserveName("conditionalexpression_resumeAt");
if (!isAtomic)
{
writer.WriteLine($"int {resumeAt} = 0;");
}

// Emit the conditional expression. We need to reroute any match failures to either the "no" branch
Expand All @@ -1353,13 +1350,7 @@ void EmitExpressionConditional(RegexNode node)
{
doneLabel = originalDoneLabel;
}

string postConditionalDoneLabel = doneLabel;
string resumeAt = ReserveName("conditionalexpression_resumeAt");
if (!isAtomic)
{
additionalDeclarations.Add($"int {resumeAt} = 0;");
}

// If we get to this point of the code, the conditional successfully matched, so run the "yes" branch.
// Since the "yes" branch may have a different execution path than the "no" branch or the lack of
Expand All @@ -1370,10 +1361,6 @@ void EmitExpressionConditional(RegexNode node)
writer.WriteLine();
TransferSliceStaticPosToPos(); // ensure all subsequent code sees the same sliceStaticPos value by setting it to 0
string postYesDoneLabel = doneLabel;
if (!isAtomic && postYesDoneLabel != originalDoneLabel)
{
writer.WriteLine($"{resumeAt} = 0;");
}
if (postYesDoneLabel != originalDoneLabel || noBranch is not null)
{
writer.WriteLine($"goto {end};");
Expand Down Expand Up @@ -1467,8 +1454,7 @@ void EmitCapture(RegexNode node, RegexNode? subsequent = null)

TransferSliceStaticPosToPos();
string startingPos = ReserveName("capture_starting_pos");
additionalDeclarations.Add($"int {startingPos} = 0;");
writer.WriteLine($"{startingPos} = pos;");
writer.WriteLine($"int {startingPos} = pos;");
writer.WriteLine();

RegexNode child = node.Child(0);
Expand Down Expand Up @@ -1604,7 +1590,8 @@ void EmitNegativeLookaheadAssertion(RegexNode node)
RegexNode.Oneloopatomic or RegexNode.Notoneloopatomic or RegexNode.Setloopatomic or // same for atomic loops
RegexNode.One or RegexNode.Notone or RegexNode.Set or // individual characters don't backtrack
RegexNode.Multi or // multiple characters don't backtrack
RegexNode.Beginning or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack
RegexNode.Ref or // backreferences don't backtrack
RegexNode.Beginning or RegexNode.Bol or RegexNode.Start or RegexNode.End or RegexNode.EndZ or RegexNode.Eol or RegexNode.Boundary or RegexNode.NonBoundary or RegexNode.ECMABoundary or RegexNode.NonECMABoundary or // anchors don't backtrack
RegexNode.Nothing or RegexNode.Empty or RegexNode.UpdateBumpalong // empty/nothing don't do anything
// Fixed-size repeaters of single characters or atomic don't backtrack
|| node.Type is RegexNode.Oneloop or RegexNode.Notoneloop or RegexNode.Setloop or RegexNode.Onelazy or RegexNode.Notonelazy or RegexNode.Setlazy && node.M == node.N
Expand Down Expand Up @@ -1965,26 +1952,30 @@ void EmitAnchors(RegexNode node)
break;

case RegexNode.End:
using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos})"))
using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()})"))
{
writer.WriteLine($"goto {doneLabel};");
}
break;

case RegexNode.EndZ:
writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n'))");
writer.WriteLine($"if ({sliceSpan}.Length - 1 > {sliceStaticPos} || ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n'))");
using (EmitBlock(writer, null))
{
writer.WriteLine($"goto {doneLabel};");
}
break;

case RegexNode.Eol:
using (EmitBlock(writer, $"if ({sliceSpan}.Length > {sliceStaticPos} && {sliceSpan}[{sliceStaticPos}] != '\\n')"))
using (EmitBlock(writer, $"if ({IsSliceLengthGreaterThanSliceStaticPos()} && {sliceSpan}[{sliceStaticPos}] != '\\n')"))
{
writer.WriteLine($"goto {doneLabel};");
}
break;

string IsSliceLengthGreaterThanSliceStaticPos() =>
sliceStaticPos == 0 ? $"!{sliceSpan}.IsEmpty" :
$"{sliceSpan}.Length > {sliceStaticPos}";
}
}

Expand Down Expand Up @@ -2222,8 +2213,7 @@ void EmitSingleCharLazy(RegexNode node, bool emitLengthChecksIfRequired = true)
maxIterations = $"{node.N - node.M}";

iterationCount = ReserveName("lazyloop_iteration");
additionalDeclarations.Add($"int {iterationCount} = 0;");
writer.WriteLine($"{iterationCount} = 0;");
writer.WriteLine($"int {iterationCount} = 0;");
}

// Track the current crawl position. Upon backtracking, we'll unwind any captures beyond this point.
Expand Down Expand Up @@ -2366,18 +2356,15 @@ void EmitLazy(RegexNode node)
string body = ReserveName("LazyLoopBody");
string endLoop = ReserveName("LazyLoopEnd");

additionalDeclarations.Add($"int {iterationCount} = 0, {startingPos} = 0, {sawEmpty} = 0;");
writer.WriteLine($"{iterationCount} = 0;");
writer.WriteLine($"{startingPos} = pos;");
writer.WriteLine($"{sawEmpty} = 0;");
writer.WriteLine();
writer.WriteLine($"int {iterationCount} = 0, {startingPos} = pos, {sawEmpty} = 0;");

// If the min count is 0, start out by jumping right to what's after the loop. Backtracking
// will then bring us back in to do further iterations.
if (minIterations == 0)
{
writer.WriteLine($"goto {endLoop};");
}
writer.WriteLine();

// Iteration body
MarkLabel(body, emitSemicolon: false);
Expand Down Expand Up @@ -3279,7 +3266,7 @@ private static void ReplaceAdditionalDeclarations(IndentedTextWriter writer, Has
{
if (declarations.Count != 0)
{
StringBuilder tmp = new StringBuilder().AppendLine();
var tmp = new StringBuilder();
foreach (string decl in declarations.OrderBy(s => s))
{
for (int i = 0; i < indent; i++)
Expand Down

0 comments on commit 61b5c92

Please sign in to comment.