Skip to content

Commit

Permalink
testing: Better support for Unicode and whitespace characters (#181)
Browse files Browse the repository at this point in the history
testing: Better support for Unicode and whitespace characters

Unicode characters can be placed in tests as regular characters.
However, due to editors/IDEs sometimes handling white space in different
ways you can add a placeholder for a specific Unicode code point using
`<U+####>`.

This will be replaced with the correct character before the test runs.
This is only a feature of SQL Tests, so will not work in any other
context.

The SQL standard names specific white space characters that must be
valid separators, we already supported this but now they are codified
into tests using the new syntax above.
  • Loading branch information
elliotchance committed Dec 22, 2023
1 parent 9106d14 commit 784da6b
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 16 deletions.
22 changes: 22 additions & 0 deletions docs/testing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,28 @@ directive:
-- EXPLAIN: TABLE FOO.PUBLIC.BAR (BAZ INTEGER)
-- EXPLAIN: EXPR (FOO.PUBLIC.BAR.BAZ INTEGER)
Unicode and Whitespace Characters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Unicode characters can be placed in tests as regular characters:

.. code-block:: sql
VALUES '✌️';
-- That's a peace sign (or the logo for V) if the character cannot be read.
However, due to editors/IDEs sometimes handling whitespace in different ways
you can add a placeholder for a specific Unicode code point using ``<U+####>``:

.. code-block:: sql
VALUES<U+0009>'hi';
-- U+0009 is a horizontal tab, equal to \t in most languages.
This will be replaced with the correct character before the test runs.

This is only a feature of SQL Tests, so will not work in any other context.

Debugging Tests
---------------

Expand Down
8 changes: 8 additions & 0 deletions tests/unicode.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
VALUES '✌️';
-- COL1: ✌️

VALUES 'ぁ';
-- COL1: ぁ

VALUES 'a <U+2022> character';
-- COL1: a • character
122 changes: 122 additions & 0 deletions tests/whitespace.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
-- # The only character that is a member of the Unicode General Category class
-- # "Zl" is U+2028 (Line Separator)
VALUES<U+2028>123;
-- COL1: 123

-- # The only character that is a member of the Unicode General Category class
-- # "Zp" is U+2029 (Paragraph Separator).
VALUES<U+2029>123;
-- COL1: 123

-- # The characters that are members of the Unicode General Category class "Zs"
-- # are:

-- # U+0020 (Space)
VALUES<U+0020>123;
-- COL1: 123

-- # U+00A0 (No-Break Space)
VALUES<U+00A0>123;
-- COL1: 123

-- # U+1680 (Ogham Space Mark)
VALUES<U+1680>123;
-- COL1: 123

-- # U+180E (Mongolian Vowel Separator)
VALUES<U+180E>123;
-- COL1: 123

-- # U+2000 (En Quad)
VALUES<U+2000>123;
-- COL1: 123

-- # U+2001 (Em Quad)
VALUES<U+2001>123;
-- COL1: 123

-- # U+2002 (En Space)
VALUES<U+2002>123;
-- COL1: 123

-- # U+2003 (Em Space)
VALUES<U+2003>123;
-- COL1: 123

-- # U+2004 (Three-Per-Em Space)
VALUES<U+2004>123;
-- COL1: 123

-- # U+2005 (Four-Per-Em Space)
VALUES<U+2005>123;
-- COL1: 123

-- # U+2006 (Six-Per-Em Space)
VALUES<U+2006>123;
-- COL1: 123

-- # U+2007 (Figure Space)
VALUES<U+2007>123;
-- COL1: 123

-- # U+2008 (Punctuation Space)
VALUES<U+2008>123;
-- COL1: 123

-- # U+2009 (Thin Space)
VALUES<U+2009>123;
-- COL1: 123

-- # U+200A (Hair Space)
VALUES<U+200A>123;
-- COL1: 123

-- # U+202F (Narrow No-Break Space)
VALUES<U+202F>123;
-- COL1: 123

-- # U+205F (Space, Medium Mathematical)
VALUES<U+205F>123;
-- COL1: 123

-- # U+3000 (Ideographic Space)
VALUES<U+3000>123;
-- COL1: 123

-- # White space is any character in the Unicode General Category classes "Zs",
-- # "Zl", and "Zp", as well as any of the following characters:

-- # U+0009, Horizontal Tabulation
VALUES<U+0009>123;
-- COL1: 123

-- # U+000A, Line Feed
VALUES<U+000A>123;
-- COL1: 123

-- # U+000B, Vertical Tabulation
VALUES<U+000B>123;
-- COL1: 123

-- # U+000C, Form Feed
VALUES<U+000C>123;
-- COL1: 123

-- # U+000D, Carriage Return
VALUES<U+000D>123;
-- COL1: 123

-- # U+0085, Next Line
VALUES<U+0085>123;
-- COL1: 123

-- # Some other combinations and error conditions.

VALUES<U+0009><U+0009><U+0009>123;
-- COL1: 123

<U+0009>VALUES 123;
-- COL1: 123

VALUES<U+0061>123;
-- error 42601: syntax error: near "VALUESa123"
18 changes: 6 additions & 12 deletions vsql/lexer.v
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,7 @@ fn tokenize(sql_stmt string) []Token {
mut i := 0

next: for i < cs.len {
// space
if cs[i] == ` ` {
i++
continue
}

// numbers
// Numbers
if cs[i] >= `0` && cs[i] <= `9` {
mut word := ''
for i < cs.len && cs[i] >= `0` && cs[i] <= `9` {
Expand All @@ -58,7 +52,7 @@ fn tokenize(sql_stmt string) []Token {
continue
}

// strings
// Strings
if cs[i] == `'` {
mut word := ''
i++
Expand All @@ -71,7 +65,7 @@ fn tokenize(sql_stmt string) []Token {
continue
}

// delimited identifiers
// Delimited identifiers
if cs[i] == `"` {
mut word := ''
i++
Expand All @@ -84,7 +78,7 @@ fn tokenize(sql_stmt string) []Token {
continue
}

// operators
// Operators
multi := {
'<>': TokenKind.not_equals_operator
'>=': TokenKind.greater_than_or_equals_operator
Expand Down Expand Up @@ -122,7 +116,7 @@ fn tokenize(sql_stmt string) []Token {
}
}

// keyword or regular identifier
// Keyword or regular identifier
mut word := ''
mut is_not_first := false
for i < cs.len && is_identifier_char(cs[i], is_not_first) {
Expand All @@ -147,7 +141,7 @@ fn tokenize(sql_stmt string) []Token {
}

@[inline]
fn is_identifier_char(c u8, is_not_first bool) bool {
fn is_identifier_char(c rune, is_not_first bool) bool {
yes := (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) || c == `_`

if is_not_first {
Expand Down
45 changes: 41 additions & 4 deletions vsql/sql_test.v
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module vsql

import os
import regex
import time

struct SQLTest {
Expand Down Expand Up @@ -56,9 +57,9 @@ fn get_tests() ![]SQLTest {
if contents == 'setup' {
in_setup = true
} else if contents.starts_with('connection ') {
stmts << contents
stmts << replace_unicode(contents)
} else if contents.starts_with('create_catalog ') {
stmts << contents
stmts << replace_unicode(contents)
} else if contents.starts_with('set ') {
parts := contents.split(' ')
if parts[2].starts_with("'") {
Expand All @@ -73,20 +74,21 @@ fn get_tests() ![]SQLTest {
panic('bad directive: "${contents}"')
}
} else if line.starts_with('-- #') {
line_number++
continue
} else if line.starts_with('-- ') {
expected << line[3..]
} else {
if in_setup {
setup_stmt += '\n${line}'
if line.ends_with(';') {
setup << setup_stmt
setup << replace_unicode(setup_stmt)
setup_stmt = ''
}
} else {
stmt += '\n${line}'
if line.ends_with(';') {
stmts << stmt
stmts << replace_unicode(stmt)
stmt = ''
}
}
Expand All @@ -103,6 +105,41 @@ fn get_tests() ![]SQLTest {
return tests
}

// replace_unicode expands `<U+XXXX>` placeholders (exactly four hex digits,
// either case) in a SQL test statement into the actual Unicode character
// they name. This lets tests specify invisible/whitespace characters
// unambiguously. It is a feature of SQL tests only, not of the SQL engine.
fn replace_unicode(s string) string {
	// Callback passed to regex.replace_by_fn; it is invoked once per match
	// of the `<U+XXXX>` pattern and its return value replaces the match.
	// NOTE(review): the code indexes `unicode_point` at start+3..start+6,
	// so `start` is assumed to be the byte offset of the match within the
	// string handed to the callback, and +3 skips the leading "<U+" —
	// confirm against V's regex module docs if this is ever refactored.
	replace_func := fn (re regex.RE, unicode_point string, start int, end int) string {
		// Hex digit -> value lookup table (both upper and lower case).
		hex_chars := {
			`0`: 0
			`1`: 1
			`2`: 2
			`3`: 3
			`4`: 4
			`5`: 5
			`6`: 6
			`7`: 7
			`8`: 8
			`9`: 9
			`a`: 10
			`b`: 11
			`c`: 12
			`d`: 13
			`e`: 14
			`f`: 15
			`A`: 10
			`B`: 11
			`C`: 12
			`D`: 13
			`E`: 14
			`F`: 15
		}
		// Combine the four hex digits into a single code point value:
		// 4096 = 16^3, 256 = 16^2, 16 = 16^1, then render it as a
		// one-character string via rune.str().
		return rune(hex_chars[unicode_point[start + 3]] * 4096 + hex_chars[unicode_point[start +
			4]] * 256 + hex_chars[unicode_point[start + 5]] * 16 + hex_chars[unicode_point[start +
			6]]).str()
	}

	// Matches placeholders of the form `<U+0009>` — exactly four hex digits,
	// so e.g. `<U+9>` or `<U+1F600>` would NOT be replaced.
	mut re := regex.regex_opt(r'<U\+[0-9A-Fa-f]{4}>') or { panic(err) }
	return re.replace_by_fn(s, replace_func)
}

fn test_all() ! {
filter_test, filter_line := get_test_filter()
verbose := $env('VERBOSE')
Expand Down

0 comments on commit 784da6b

Please sign in to comment.