From ddb09d22a737deea0d0a9ab58cd5d337164bbbfe Mon Sep 17 00:00:00 2001 From: David Symonds Date: Tue, 4 May 2021 10:44:30 +1000 Subject: [PATCH] feat(spanner/spansql): case insensitive parsing of keywords and functions (#4034) This handles the most important parts of the parser as it relates to case insensitivity, namely permitting the lexical tokens for the grammar (as opposed to elements like table/column names) to have arbitrary case. This includes a complete audit of the parser to check all the lexical token handling, and cleaned up a few places that were particularly ancient. The rendered output remains canonically uppercase. Fixes #4032. --- spanner/spannertest/README.md | 2 +- spanner/spansql/parser.go | 138 ++++++++++++++++----------------- spanner/spansql/parser_test.go | 4 +- 3 files changed, 70 insertions(+), 74 deletions(-) diff --git a/spanner/spannertest/README.md b/spanner/spannertest/README.md index 14caba869b5..6cd4f0a1fe0 100644 --- a/spanner/spannertest/README.md +++ b/spanner/spannertest/README.md @@ -21,12 +21,12 @@ by ascending esotericism: - NUMERIC - more aggregation functions - SELECT HAVING -- case insensitivity - more literal types - generated columns - expression type casting, coercion - multiple joins - subselects +- case insensitivity of table and column names and query aliases - transaction simulation - FOREIGN KEY and CHECK constraints - INSERT DML statements diff --git a/spanner/spansql/parser.go b/spanner/spansql/parser.go index 674c54ae019..b4513ebcd58 100644 --- a/spanner/spansql/parser.go +++ b/spanner/spansql/parser.go @@ -885,6 +885,13 @@ func (p *parser) next() *token { return &p.cur } +// caseEqual reports whether the token is valid, not a quoted identifier, and +// equal to the provided string under a case insensitive comparison. +// Use this (or sniff/eat/expect) instead of comparing a string directly for keywords, etc. +func (t *token) caseEqual(x string) bool { + return t.err == nil && t.typ != quotedID && strings.EqualFold(t.value, x) +} + // sniff reports whether the next N tokens are as specified. func (p *parser) sniff(want ...string) bool { // Store current parser state and restore on the way out. @@ -892,8 +899,7 @@ func (p *parser) sniff(want ...string) bool { defer func() { *p = orig }() for _, w := range want { - tok := p.next() - if tok.err != nil || tok.value != w { + if !p.next().caseEqual(w) { return false } } @@ -907,8 +913,7 @@ func (p *parser) eat(want ...string) bool { orig := *p for _, w := range want { - tok := p.next() - if tok.err != nil || tok.value != w { + if !p.next().caseEqual(w) { // Mismatch. *p = orig return false @@ -922,7 +927,7 @@ func (p *parser) expect(want string) *parseError { if tok.err != nil { return tok.err } - if tok.value != want { + if !tok.caseEqual(want) { return p.errorf("got %q while expecting %q", tok.value, want) } return nil @@ -958,18 +963,22 @@ func (p *parser) parseDDLStmt() (DDLStmt, *parseError) { if tok.err != nil { return nil, tok.err } - kind := tok.value - if kind != "TABLE" && kind != "INDEX" { - return nil, p.errorf("got %q, want TABLE or INDEX", kind) - } - name, err := p.parseTableOrIndexOrColumnName() - if err != nil { - return nil, err - } - if kind == "TABLE" { + switch { + default: + return nil, p.errorf("got %q, want TABLE or INDEX", tok.value) + case tok.caseEqual("TABLE"): + name, err := p.parseTableOrIndexOrColumnName() + if err != nil { + return nil, err + } return &DropTable{Name: name, Position: pos}, nil + case tok.caseEqual("INDEX"): + name, err := p.parseTableOrIndexOrColumnName() + if err != nil { + return nil, err + } + return &DropIndex{Name: name, Position: pos}, nil } - return &DropIndex{Name: name, Position: pos}, nil } return nil, p.errorf("unknown DDL statement") @@ -1200,10 +1209,10 @@ func (p *parser) parseAlterTable() (*AlterTable, *parseError) { if tok.err != nil { return nil, tok.err } - switch tok.value { + switch { default: return nil, p.errorf("got %q, expected ADD or DROP or SET or ALTER", tok.value) - case "ADD": + case tok.caseEqual("ADD"): if p.sniff("CONSTRAINT") || p.sniff("FOREIGN") || p.sniff("CHECK") { tc, err := p.parseTableConstraint() if err != nil { @@ -1223,7 +1232,7 @@ func (p *parser) parseAlterTable() (*AlterTable, *parseError) { } a.Alteration = AddColumn{Def: cd} return a, nil - case "DROP": + case tok.caseEqual("DROP"): if p.eat("CONSTRAINT") { name, err := p.parseTableOrIndexOrColumnName() if err != nil { @@ -1243,7 +1252,7 @@ func (p *parser) parseAlterTable() (*AlterTable, *parseError) { } a.Alteration = DropColumn{Name: name} return a, nil - case "SET": + case tok.caseEqual("SET"): if err := p.expect("ON"); err != nil { return nil, err } @@ -1256,7 +1265,7 @@ func (p *parser) parseAlterTable() (*AlterTable, *parseError) { } a.Alteration = SetOnDelete{Action: od} return a, nil - case "ALTER": + case tok.caseEqual("ALTER"): // TODO: "COLUMN" is optional. if err := p.expect("COLUMN"); err != nil { return nil, err @@ -1459,6 +1468,8 @@ func (p *parser) parseColumnOptions() (ColumnOptions, *parseError) { return ColumnOptions{}, err } + // TODO: Figure out if column options are case insensitive. + // We ignore case for the key (because it is easier) but not the value. var co ColumnOptions if p.eat("allow_commit_timestamp", "=") { tok := p.next() @@ -1512,18 +1523,10 @@ func (p *parser) parseKeyPart() (KeyPart, *parseError) { kp := KeyPart{Column: name} - tok := p.next() - if tok.err != nil { - // End of the key_part. - p.back() - return kp, nil - } - switch tok.value { - case "ASC": - case "DESC": + if p.eat("ASC") { + // OK. + } else if p.eat("DESC") { kp.Desc = true - default: - p.back() } return kp, nil @@ -1679,7 +1682,7 @@ func (p *parser) parseType() (Type, *parseError) { if tok.err != nil { return Type{}, tok.err } - if tok.value == "ARRAY" { + if tok.caseEqual("ARRAY") { t.Array = true if err := p.expect("<"); err != nil { return Type{}, err @@ -1689,7 +1692,7 @@ func (p *parser) parseType() (Type, *parseError) { return Type{}, tok.err } } - base, ok := baseTypes[tok.value] + base, ok := baseTypes[strings.ToUpper(tok.value)] // baseTypes is keyed by upper case strings. if !ok { return Type{}, p.errorf("got %q, want scalar type", tok.value) } @@ -1704,7 +1707,7 @@ func (p *parser) parseType() (Type, *parseError) { if tok.err != nil { return Type{}, tok.err } - if tok.value == "MAX" { + if tok.caseEqual("MAX") { t.Len = MaxLen } else if tok.typ == int64Token { n, err := strconv.ParseInt(tok.value, tok.int64Base, 64) @@ -1746,7 +1749,6 @@ func (p *parser) parseQuery() (Query, *parseError) { // TODO: hints, sub-selects, etc. - // TODO: use a case-insensitive select. if err := p.expect("SELECT"); err != nil { return Query{}, err } @@ -1988,7 +1990,7 @@ func (p *parser) parseSelectFrom() (SelectFrom, *parseError) { return sf, nil } var hashJoin bool // Special case for "HASH JOIN" syntax. - if tok.value == "HASH" { + if tok.caseEqual("HASH") { hashJoin = true tok = p.next() if tok.err != nil { @@ -1996,7 +1998,7 @@ func (p *parser) parseSelectFrom() (SelectFrom, *parseError) { } } var jt JoinType - if tok.value == "JOIN" { + if tok.caseEqual("JOIN") { // This is implicitly an inner join. jt = InnerJoin } else if j, ok := joinKeywords[tok.value]; ok { @@ -2103,9 +2105,9 @@ func (p *parser) parseTableSample() (TableSample, *parseError) { switch { case tok.err != nil: return ts, tok.err - case tok.value == "BERNOULLI": + case tok.caseEqual("BERNOULLI"): ts.Method = Bernoulli - case tok.value == "RESERVOIR": + case tok.caseEqual("RESERVOIR"): ts.Method = Reservoir default: return ts, p.errorf("got %q, want BERNOULLI or RESERVOIR", tok.value) @@ -2127,9 +2129,9 @@ func (p *parser) parseTableSample() (TableSample, *parseError) { switch { case tok.err != nil: return ts, tok.err - case tok.value == "PERCENT": + case tok.caseEqual("PERCENT"): ts.SizeType = PercentTableSample - case tok.value == "ROWS": + case tok.caseEqual("ROWS"): ts.SizeType = RowsTableSample default: return ts, p.errorf("got %q, want PERCENT or ROWS", tok.value) @@ -2153,13 +2155,10 @@ func (p *parser) parseOrder() (Order, *parseError) { } o := Order{Expr: expr} - tok := p.next() - switch { - case tok.err == nil && tok.value == "ASC": - case tok.err == nil && tok.value == "DESC": + if p.eat("ASC") { + // OK. + } else if p.eat("DESC") { o.Desc = true - default: - p.back() } return o, nil @@ -2352,9 +2351,7 @@ func (p *parser) parseIsOp() (Expr, *parseError) { return nil, err } - tok := p.next() - if tok.err != nil || tok.value != "IS" { - p.back() + if !p.eat("IS") { return expr, nil } @@ -2363,16 +2360,16 @@ func (p *parser) parseIsOp() (Expr, *parseError) { isOp.Neg = true } - tok = p.next() + tok := p.next() if tok.err != nil { return nil, tok.err } - switch tok.value { - case "NULL": + switch { + case tok.caseEqual("NULL"): isOp.RHS = Null - case "TRUE": + case tok.caseEqual("TRUE"): isOp.RHS = True - case "FALSE": + case tok.caseEqual("FALSE"): isOp.RHS = False default: return nil, p.errorf("got %q, want NULL or TRUE or FALSE", tok.value) @@ -2566,8 +2563,8 @@ func (p *parser) parseLit() (Expr, *parseError) { // If the literal was an identifier, and there's an open paren next, // this is a function invocation. - // TODO: Case-insensitivity. - if name := tok.value; funcs[name] && p.sniff("(") { + // The `funcs` map is keyed by upper case strings. + if name := strings.ToUpper(tok.value); funcs[name] && p.sniff("(") { list, err := p.parseParenExprList() if err != nil { return nil, err @@ -2579,30 +2576,28 @@ func (p *parser) parseLit() (Expr, *parseError) { } // Handle some reserved keywords and special tokens that become specific values. - switch tok.value { - case "TRUE": + switch { + case tok.caseEqual("TRUE"): return True, nil - case "FALSE": + case tok.caseEqual("FALSE"): return False, nil - case "NULL": + case tok.caseEqual("NULL"): return Null, nil - case "*": + case tok.value == "*": return Star, nil default: - // TODO: Check IsKeyWord(tok.value), and return a distinguished type, - // then only accept that when parsing. That will also permit - // case insensitivity for keywords. + // TODO: Check IsKeyWord(tok.value), and return a good error? } // Handle typed literals. - switch tok.value { - case "ARRAY", "[": + switch { + case tok.caseEqual("ARRAY") || tok.value == "[": p.back() return p.parseArrayLit() - case "DATE": + case tok.caseEqual("DATE"): p.back() return p.parseDateLit() - case "TIMESTAMP": + case tok.caseEqual("TIMESTAMP"): p.back() return p.parseTimestampLit() } @@ -2787,10 +2782,10 @@ func (p *parser) parseOnDelete() (OnDelete, *parseError) { if tok.err != nil { return 0, tok.err } - if tok.value == "CASCADE" { + if tok.caseEqual("CASCADE") { return CascadeOnDelete, nil } - if tok.value != "NO" { + if !tok.caseEqual("NO") { return 0, p.errorf("got %q, want NO or CASCADE", tok.value) } if err := p.expect("ACTION"); err != nil { @@ -2801,6 +2796,7 @@ func (p *parser) parseOnDelete() (OnDelete, *parseError) { // parseCommaList parses a comma-separated list enclosed by bra and ket, // delegating to f for the individual element parsing. +// Only invoke this with symbols as bra/ket; they are matched literally, not case insensitively. func (p *parser) parseCommaList(bra, ket string, f func(*parser) *parseError) *parseError { if err := p.expect(bra); err != nil { return err diff --git a/spanner/spansql/parser_test.go b/spanner/spansql/parser_test.go index 2c00fb32b32..a79e54828f6 100644 --- a/spanner/spansql/parser_test.go +++ b/spanner/spansql/parser_test.go @@ -32,7 +32,7 @@ func TestParseQuery(t *testing.T) { want Query }{ {`SELECT 17`, Query{Select: Select{List: []Expr{IntegerLiteral(17)}}}}, - {`SELECT Alias AS aka FROM Characters WHERE Age < @ageLimit AND Alias IS NOT NULL ORDER BY Age DESC LIMIT @limit OFFSET 3` + "\n\t", + {`SELECT Alias AS aka From Characters WHERE Age < @ageLimit AND Alias IS NOT NULL ORDER BY Age DESC LIMIT @limit OFFSET 3` + "\n\t", Query{ Select: Select{ List: []Expr{ID("Alias")}, @@ -410,7 +410,7 @@ func TestParseDDL(t *testing.T) { -- Table with generated column. CREATE TABLE GenCol ( Name STRING(MAX) NOT NULL, - NameLen INT64 AS (CHAR_LENGTH(Name)) STORED, + NameLen INT64 AS (char_length(Name)) STORED, ) PRIMARY KEY (Name); -- Trailing comment at end of file.