-
Notifications
You must be signed in to change notification settings - Fork 30
/
Parser.ts
333 lines (296 loc) · 10.4 KB
/
Parser.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
/* eslint-disable @typescript-eslint/no-explicit-any */
/* eslint-disable no-useless-constructor */
/* eslint-disable max-len */
/* eslint-disable no-constant-condition */
/* eslint-disable no-empty */
/* eslint-disable no-inner-declarations */
/* eslint-disable implicit-arrow-linebreak */
// 也叫ParseC.把语法规则和词法规则翻译成了一系列parser的 “组合”.
// https://qszhu.github.io/2021/08/22/parser-combinators.html
// https://github.com/francisrstokes/arcsecond
// https://time.geekbang.org/column/intro/436
// 缺点:各种地方都慢,因为是暴力匹配ll(k),绝大多数情况ll(1)顶多ll(2)就够用了
// 状态载体:Parser
// 基本单元:str,regExp,Whitespace,Ignored,StringLiteral,NumberLiteral
// 带空格的基本单元:token,regExpToken
// 修饰符:zeroOrMore,zeroOrOne,oneOrMore,oneOf,seqOf
// 避免循环依赖:lazy
// 工具函数: between,sepBy...
interface IParserState<T> {
/** 原始的输入字符串. */
origin: string
/** 当前的解析位置. */
index: number
/** 解析过程中是否出错.不使用try-catch来加速. */
hasError: boolean
/** 当前的解析结果. */
result?: T
}
/** 解析器函数.输一个状态,返回一个 **新的** 状态. */
type ParserFn<T> = (state: IParserState<any>) => IParserState<T>
class Parser<T = any> {
/**
* @param parserFn 解析器函数.输一个状态,返回一个 **新的** 状态.
*/
constructor(readonly parserFn: ParserFn<T>) {}
parse(s: string): IParserState<T> {
const initState = { origin: s, index: 0, hasError: false }
const nextState = this.parserFn(initState)
if (nextState.index !== s.length) nextState.hasError = true
return nextState
}
map<U>(f: (parsedResult: T) => U): Parser<U> {
return new Parser(state => {
const nextState: IParserState<any> = this.parserFn(state)
if (nextState.hasError) return nextState
nextState.result = f(nextState.result)
return nextState
})
}
}
function str<S extends string>(s: S): Parser<S> {
return new Parser(state => {
if (state.hasError) return state
const { origin, index } = state
if (!origin.startsWith(s, index)) return { ...state, hasError: true }
return { ...state, index: index + s.length, result: s }
})
}
/**
* 支持正则匹配的parser构造函数.
*/
function regExp(pattern: RegExp): Parser<string> {
return new Parser(state => {
if (state.hasError) return state
// !如果正则表达式不是以^开头的话,则可能在某个中间位置匹配到结果,那样就相当于跳过了某些字符做了匹配
if (!pattern.source.startsWith('^')) {
throw new Error(`regExp: "${pattern}" should start with "^"`)
}
const { origin, index } = state
const matching = origin.slice(index).match(pattern)
if (matching == null) {
return { ...state, hasError: true }
}
const [res] = matching
return { ...state, index: index + res.length, result: res }
})
}
/**
* 对应正则表达式中的 `*`,表示匹配 0 次或多次.
* 不会抛出错误,不能匹配到的话只会返回空的结果。
*/
function zeroOrMore<T>(parser: Parser<T>): Parser<T[]> {
return new Parser(state => {
if (state.hasError) return state
const res: T[] = []
let nextState = state
while (true) {
const testState = parser.parserFn(nextState)
if (testState.hasError) break
nextState = testState
res.push(nextState.result)
}
return { ...nextState, result: res }
})
}
/**
* 对应正则表达式中的 `?`,表示匹配 0 次或 1 次.
* 不会抛出错误,不能匹配到的话只会返回空的结果。
*/
function zeroOrOne<T>(parser: Parser<T>): Parser<T | undefined> {
return new Parser(state => {
if (state.hasError) return state
const nextState = parser.parserFn(state)
if (nextState.hasError) return { ...state, result: undefined }
return nextState
})
}
/**
* 对应正则表达式中的 `+`,表示匹配 1 次或多次.
* 至少匹配一次,否则抛出错误.
*/
function oneOrMore<T>(parser: Parser<T>): Parser<T[]> {
return new Parser(state => {
if (state.hasError) return state
const res: T[] = []
let nextState = state
while (true) {
nextState = parser.parserFn(nextState)
if (nextState.hasError) break
res.push(nextState.result)
}
if (res.length) return { ...nextState, result: res, hasError: false }
return { ...state, hasError: true }
})
}
/**
* 对应正则表达式中的 `|`.
* 至少匹配一次,否则抛出错误.
*/
function oneOf<T extends readonly Parser<unknown>[]>(
...parsers: T
): Parser<{ [K in keyof T]: T[K] extends Parser<infer U> ? U : never }[number]> {
return new Parser(state => {
if (state.hasError) return state
for (let i = 0; i < parsers.length; i++) {
const p = parsers[i]
const nextState = p.parserFn(state)
if (!nextState.hasError) return nextState
}
return { ...state, hasError: true }
})
}
/**
* 构造匹配一个`序列`的parser.
* 传入的parser必须依次匹配成功,否则会抛出错误.
*/
function seqOf<T extends readonly Parser<unknown>[]>(
...parsers: T
): Parser<{ [K in keyof T]: T[K] extends Parser<infer U> ? U : never }> {
return new Parser(state => {
if (state.hasError) return state
const res: unknown[] = []
let nextState = state
for (let i = 0; i < parsers.length; i++) {
const p = parsers[i]
nextState = p.parserFn(nextState)
if (nextState.hasError) return { ...state, hasError: true }
res.push(nextState.result)
}
return { ...nextState, result: res }
})
}
/**
* @see {@link https://github.com/francisrstokes/arcsecond}
*/
const between =
(left: Parser<unknown>, right: Parser<unknown>) =>
<T>(content: Parser<T>) =>
seqOf(left, content, right).map(([_, res]) => res)
/**
* @see {@link https://github.com/francisrstokes/arcsecond}
*/
const sepBy =
(sep: Parser<any>) =>
<T>(value: Parser<T>): Parser<T[]> =>
new Parser(state => {
if (state.hasError) return state
const res: T[] = []
let nextState = state
while (true) {
const valueState = value.parserFn(nextState)
if (valueState.hasError) break
res.push(valueState.result as T)
nextState = valueState
const sepState = sep.parserFn(nextState)
if (sepState.hasError) break
nextState = sepState
}
return { ...nextState, result: res }
})
/**
* 惰性求值来处理语法中的循环依赖.
*/
const lazy = <T>(thunk: () => Parser<T>) => new Parser(state => thunk().parserFn(state))
const Whitespace = regExp(/^\s/)
const Ignored = zeroOrMore(Whitespace)
/**
* 如果遇到前面有空白符号,则在匹配之后丢弃.
*/
const token = <S extends string>(s: S): Parser<S> =>
seqOf(Ignored, str(s)).map<S>(([_, res]) => res)
/**
* 如果遇到前面有空白符号,则在匹配之后丢弃.
*/
const regexToken = (pattern: RegExp): Parser<string> =>
seqOf(Ignored, regExp(pattern)).map(([_, res]) => res)
/** 匹配小括号()间的内容. */
const betweenParens = between(token('('), token(')'))
/** 匹配中括号[]间的内容. */
const betweenBrackets = between(token('['), token(']'))
/** 匹配大括号{}间的内容. */
const betweenBraces = between(token('{'), token('}'))
/** 只支持双引号(“),单引号(‘)和反引号(`)不支持. */
const StringLiteral = regexToken(/^"[^"]*"/)
/** 数字为十进制,支持正负和小数. */
const NumberLiteral = regexToken(/^[+-]?[0-9]+(\.[0-9]*)?/)
/** 小写英文字母. */
const LowercaseLiteral = regexToken(/^[a-z]+/)
/** 大写英文字母. */
const UpperCaseLiteral = regexToken(/^[A-Z]+/)
export {
Parser,
str,
regExp,
zeroOrMore,
zeroOrOne,
oneOrMore,
oneOf,
seqOf,
between,
sepBy,
lazy,
Whitespace,
Ignored,
token,
regexToken,
betweenParens,
betweenBrackets,
betweenBraces,
StringLiteral,
NumberLiteral,
LowercaseLiteral,
UpperCaseLiteral
}
if (require.main === module) {
// 1. prog = (functionDecl | functionCall)* ;
// 2. functionDecl: "function" Identifier "(" ")" functionBody;
// 3. functionBody : '{' functionCall* '}' ;
// 4. functionCall : Identifier '(' parameterList? ')' ;
// 5. parameterList : StringLiteral (',' StringLiteral)* ;
// 6. expression: primary (binOP primary)* ;
// 7. primary: StringLiteral | DecimalLiteral | IntegerLiteral | functionCall | '(' expression ')' ;
// !无论我们如何调整这些规则实现的先后顺序,都会遇到“在定义前使用”的编译错误。这可以通过惰性求值来避免:
// 1. prog = (functionDecl | functionCall)* ;
const prog = lazy(() => zeroOrMore(oneOf(functionDecl, functionCall))).map(stmts => ({
type: 'prog',
stmts
}))
// 2. functionDecl: "function" Identifier "(" ")" functionBody;
const functionDecl = lazy(() =>
seqOf(token('function'), Identifier, token('('), token(')'), functionBody)
).map(([_, name, _lp, _rp, body]) => ({ type: 'functionDecl', name, body }))
// 3. functionBody : '{' functionCall* '}' ;
const functionBody = lazy(() => seqOf(token('{'), zeroOrMore(functionCall), token('}'))).map(
([_lb, calls, _rb]) => calls
)
// 4. functionCall : Identifier '(' parameterList? ')' ;
const functionCall = lazy(() =>
seqOf(Identifier, token('('), zeroOrOne(parameterList), token(')'))
).map(([name, _lp, params, _rp]) => ({ type: 'functionCall', name, params }))
// 5. parameterList : StringLiteral (',' StringLiteral)* ;
const parameterList = lazy(() =>
seqOf(StringLiteral, zeroOrMore(seqOf(token(','), StringLiteral)))
).map(([param, params]) => [param, ...params.map(([_comma, param]: unknown[]) => param)])
/** 终结符Identifier */
const Identifier = regexToken(/^[a-zA-Z_][a-zA-Z0-9_]*/)
test()
function test() {
const res3 = prog.parse(`
function foo() {
println("in foo...")
}
function bar() {
println("in bar...")
foo()
}
bar()`)
console.log(JSON.stringify(res3, null, 2))
}
testSepBy()
function testSepBy(): void {
const p = sepBy(token(','))
const res = p(StringLiteral).parse('"a", "b", "c"')
console.log(res)
}
}