Skip to content

Commit

Permalink
Avoid needless string allocation in lexer
Browse files Browse the repository at this point in the history
Create the matched substring on demand, through the new Lexer:matched()
method, instead of allocating it on every successful Lexer:try(). Additionally,
speed up long-string parsing by advancing multiple characters at once instead of one at a time.
  • Loading branch information
hugomg committed Jun 10, 2023
1 parent ac09382 commit cdcd017
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 36 deletions.
94 changes: 58 additions & 36 deletions src/pallene/Lexer.lua
Expand Up @@ -26,12 +26,14 @@ local newline = P"\n\r" + P"\r\n" + P"\n" + P"\r" -- See inclinenumber in llex.c
local find_newline = (1 - newline)^0 * newline

local comment_line = RE"[^\n\r]*" * newline^-1

local longstring_open = P("[") * P("=")^0 * P("[")
local longstring_content = RE".[^]]*"

local hex_number = RE"[0-9A-Fa-f]"

local string_delimiter = RE"[\"\']"
local string_content = RE"[^\n\r]"
local string_hex_number = RE"[0-9A-Fa-f][0-9A-Fa-f]?"
local string_hex_number = RE"[0-9A-Fa-f][0-9A-Fa-f]"
local string_dec_number = RE"[0-9][0-9]?[0-9]?"
local string_u_number = RE"[0-9A-Fa-f]+"

Expand Down Expand Up @@ -75,14 +77,15 @@ function Lexer:init(file_name, input)
self.pos = 1 -- Absolute position in the input
self.line = 1 -- Line number for error messages
self.col = 1 -- Column number for error messages
self.matched = false -- Last matched substring
self.old_pos = false -- Absolute position of last matched substring
end

-- Builds a Location from the lexer's current state: the file name, the
-- line and column counters, and the absolute position in the input.
function Lexer:loc()
    local file_name, line, col, pos = self.file_name, self.line, self.col, self.pos
    return Location.new(file_name, line, col, pos)
end

-- If the given pattern matches, move the lexer forward and set self.matched.
-- If the given pattern matches, move the lexer forward.
-- If it doesn't match, unset old_pos.
-- The pattern can be either an LPEG pattern or a literal string.
local pattern_cache = {}
function Lexer:try(pat)
Expand All @@ -92,25 +95,33 @@ function Lexer:try(pat)
end
assert(lpeg.type(pat) == "pattern")

local old_pos = self.pos
local new_pos = pat:match(self.input, self.pos)
if new_pos then
self.matched = string.sub(self.input, self.pos, new_pos - 1)
local i = 1
local i = old_pos
while true do
local j = find_newline:match(self.matched, i)
if not j then break end
local j = find_newline:match(self.input, i)
if not j or j > new_pos then break end
self.line = self.line + 1
self.col = 1
i = j
end
self.col = self.col + #self.matched - i + 1
self.old_pos = old_pos
self.pos = new_pos
self.col = self.col + (new_pos - i)
return true
else
self.old_pos = false
return false
end
end

-- Returns the text consumed by the most recent successful Lexer:try().
-- The substring is sliced out of self.input only when requested; it spans
-- from self.old_pos up to (but not including) self.pos.
-- Fails an assertion if self.old_pos is false, i.e. there is no last match.
function Lexer:matched()
    local first = assert(self.old_pos)
    local last = self.pos - 1
    return string.sub(self.input, first, last)
end

function Lexer:read_short_string(delimiter)
local parts = {}
while not self:try(delimiter) do
Expand All @@ -119,18 +130,19 @@ function Lexer:read_short_string(delimiter)
then table.insert(parts, "\n")

elseif self:try(string_dec_number) then
local n = assert(tonumber(self.matched, 10))
local n = assert(tonumber(self:matched(), 10))
if n < 256 then
table.insert(parts, string.char(n))
else
return false, "decimal escape sequence too large"
end

elseif self:try("x") then
if self:try(string_hex_number) and #self.matched == 2 then
local n = assert(tonumber(self.matched, 16))
if self:try(string_hex_number) then
local n = assert(tonumber(self:matched(), 16))
table.insert(parts, string.char(n))
else
self:try(hex_number) -- possibly advance error location
return false, "hexadecimal digit expected"
end

Expand All @@ -141,8 +153,9 @@ function Lexer:read_short_string(delimiter)
if not self:try(string_u_number) then
return false, "hexadecimal digit expected"
end
local n = tonumber(self.matched, 16)
if #self.matched > 8 or n >= 0x7fffffff then
local s = self:matched()
local n = tonumber(s, 16)
if #s > 8 or n >= 0x7fffffff then
return false, "UTF-8 value too large"
end
if not self:try("}") then
Expand All @@ -155,18 +168,19 @@ function Lexer:read_short_string(delimiter)
self:try(space)

elseif self:try(one_char) then
local s = string_escapes[self.matched]
if s then
table.insert(parts, s)
local s = self:matched()
local c = string_escapes[s]
if c then
table.insert(parts, c)
else
return false, string.format("invalid escape sequence '\\%s'", self.matched)
return false, string.format("invalid escape sequence '\\%s'", s)
end
else
return false, "unfinished string"
end

elseif self:try(string_content) then
table.insert(parts, self.matched)
table.insert(parts, self:matched())

else
return false, "unfinished string"
Expand All @@ -180,15 +194,18 @@ function Lexer:read_long_string(delimiter_size, what)
local close = "]" .. string.rep("=", delimiter_size) .. "]"

self:try(newline)
local parts = {}
local first_pos = self.pos
local last_pos = self.pos

while not self:try(close) do
if self:try(one_char) then
table.insert(parts, self.matched)
if self:try(longstring_content) then
last_pos = self.pos
else
return false, string.format("unfinished %s (starting at line %d)", what, firstline)
end
end
return table.concat(parts)

return string.sub(self.input, first_pos, last_pos - 1)
end

function Lexer:_next()
Expand All @@ -197,46 +214,51 @@ function Lexer:_next()

elseif self:try("--") then
if self:try(longstring_open) then
local s, err = self:read_long_string(#self.matched-2, "long comment")
local len = self.pos - self.old_pos - 2
local s, err = self:read_long_string(len, "long comment")
if not s then return false, err end
return "COMMENT", s
else
self:try(comment_line)
return "COMMENT", self.matched
return "COMMENT", self:matched()
end

elseif self:try(string_delimiter) then
local s, err = self:read_short_string(self.matched)
local s, err = self:read_short_string(self:matched())
if not s then return false, err end
return "STRING", s

elseif self:try(longstring_open) then
local s, err = self:read_long_string(#self.matched-2, "long string")
local len = self.pos - self.old_pos - 2
local s, err = self:read_long_string(len, "long string")
if not s then return false, err end
return "STRING", s

elseif self:try(possible_number) then
local n = tonumber(self.matched)
local s = self:matched()
local n = tonumber(s)
if n then
return "NUMBER", n
else
return false, string.format("malformed number near '%s'", self.matched)
return false, string.format("malformed number near '%s'", s)
end

elseif self:try(symbol) then -- Must try this after numbers, because of '.'
return self.matched
return self:matched()

elseif self:try(identifier) then
if is_keyword[self.matched] then
return self.matched
local name = self:matched()
if is_keyword[name] then
return name
else
return "NAME", self.matched
return "NAME", name
end

elseif self:try(one_char) then
local what = (string.match(self.matched, "%g")
and string.format("'%s'", self.matched)
or string.format("<\\%d>", string.byte(self.matched)))
local c = self:matched()
local what = (string.match(c, "%g")
and string.format("'%s'", c)
or string.format("<\\%d>", string.byte(c)))
return false, string.format("unexpected symbol near %s", what)

else
Expand Down
1 change: 1 addition & 0 deletions src/pallene/parser.lua
Expand Up @@ -51,6 +51,7 @@ function Parser:advance()
tok, err = self.lexer:next()
if not tok then
self:syntax_error(self.lexer:loc(), "%s", err)
self:abort_parsing()
end
if tok.name == "COMMENT" then
table.insert(self.comment_regions, { tok.loc.pos, tok.end_pos })
Expand Down

0 comments on commit cdcd017

Please sign in to comment.