Skip to content

Commit

Permalink
testing: Better support for Unicode and whitespace characters (#181)
Browse files Browse the repository at this point in the history
testing: Better support for Unicode and whitespace characters

Unicode characters can be placed in tests as regular characters.
However, due to editors/IDEs sometimes handling white space in different
ways you can add a placeholder for a specific Unicode code point using
`<U+####>`.

This will be replaced with the correct character before the test runs.
This is only a feature of SQL Tests, so will not work in any other
context.

The SQL standard names specific white space characters that must be
valid separators, we already supported this but now they are codified
into tests using the new syntax above.
  • Loading branch information
elliotchance committed Dec 22, 2023
1 parent 9106d14 commit 784da6b
Show file tree
Hide file tree
Showing 5 changed files with 199 additions and 16 deletions.
22 changes: 22 additions & 0 deletions docs/testing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,28 @@ directive:
-- EXPLAIN: TABLE FOO.PUBLIC.BAR (BAZ INTEGER)
-- EXPLAIN: EXPR (FOO.PUBLIC.BAR.BAZ INTEGER)
Unicode and Whitespace Characters
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Unicode characters can be placed in tests as regular characters:

.. code-block:: sql
VALUES '✌️';
-- That's a peace sign (or the logo for V) if the character cannot be read.
However, due to editors/IDEs sometimes handling whitespace in different ways
you can add a placeholder for a specific Unicode code point using ``<U+####>``:

.. code-block:: sql
VALUES<U+0009>'hi';
-- U+0009 is a horizontal tab, equal to \t in most languages.
This will be replaced with the correct character before the test runs.

This is only a feature of SQL Tests, so will not work in any other context.

Debugging Tests
---------------

Expand Down
8 changes: 8 additions & 0 deletions tests/unicode.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
VALUES '✌️';
-- COL1: ✌️

VALUES 'ぁ';
-- COL1: ぁ

VALUES 'a <U+2022> character';
-- COL1: a • character
122 changes: 122 additions & 0 deletions tests/whitespace.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
-- # The only character that is a member of the Unicode General Category class
-- # "Zl" is U+2028 (Line Separator)
VALUES<U+2028>123;
-- COL1: 123

-- # The only character that is a member of the Unicode General Category class
-- # "Zp" is U+2029 (Paragraph Separator).
VALUES<U+2029>123;
-- COL1: 123

-- # The characters that are members of the Unicode General Category class "Zs"
-- # are:

-- # U+0020 (Space)
VALUES<U+0020>123;
-- COL1: 123

-- # U+00A0 (No-Break Space)
VALUES<U+00A0>123;
-- COL1: 123

-- # U+1680 (Ogham Space Mark)
VALUES<U+1680>123;
-- COL1: 123

-- # U+180E (Mongolian Vowel Separator)
VALUES<U+180E>123;
-- COL1: 123

-- # U+2000 (En Quad)
VALUES<U+2000>123;
-- COL1: 123

-- # U+2001 (Em Quad)
VALUES<U+2001>123;
-- COL1: 123

-- # U+2002 (En Space)
VALUES<U+2002>123;
-- COL1: 123

-- # U+2003 (Em Space)
VALUES<U+2003>123;
-- COL1: 123

-- # U+2004 (Three-Per-Em Space)
VALUES<U+2004>123;
-- COL1: 123

-- # U+2005 (Four-Per-Em Space)
VALUES<U+2005>123;
-- COL1: 123

-- # U+2006 (Six-Per-Em Space)
VALUES<U+2006>123;
-- COL1: 123

-- # U+2007 (Figure Space)
VALUES<U+2007>123;
-- COL1: 123

-- # U+2008 (Punctuation Space)
VALUES<U+2008>123;
-- COL1: 123

-- # U+2009 (Thin Space)
VALUES<U+2009>123;
-- COL1: 123

-- # U+200A (Hair Space)
VALUES<U+200A>123;
-- COL1: 123

-- # U+202F (Narrow No-Break Space)
VALUES<U+202F>123;
-- COL1: 123

-- # U+205F (Space, Medium Mathematical)
VALUES<U+205F>123;
-- COL1: 123

-- # U+3000 (Ideographic Space)
VALUES<U+3000>123;
-- COL1: 123

-- # White space is any character in the Unicode General Category classes "Zs",
-- # "Zl", and "Zp", as well as any of the following characters:

-- # U+0009, Horizontal Tabulation
VALUES<U+0009>123;
-- COL1: 123

-- # U+000A, Line Feed
VALUES<U+000A>123;
-- COL1: 123

-- # U+000B, Vertical Tabulation
VALUES<U+000B>123;
-- COL1: 123

-- # U+000C, Form Feed
VALUES<U+000C>123;
-- COL1: 123

-- # U+000D, Carriage Return
VALUES<U+000D>123;
-- COL1: 123

-- # U+0085, Next Line
VALUES<U+0085>123;
-- COL1: 123

-- # Some other combinations and error conditions.

VALUES<U+0009><U+0009><U+0009>123;
-- COL1: 123

<U+0009>VALUES 123;
-- COL1: 123

VALUES<U+0061>123;
-- error 42601: syntax error: near "VALUESa123"
18 changes: 6 additions & 12 deletions vsql/lexer.v
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,7 @@ fn tokenize(sql_stmt string) []Token {
mut i := 0

next: for i < cs.len {
// space
if cs[i] == ` ` {
i++
continue
}

// numbers
// Numbers
if cs[i] >= `0` && cs[i] <= `9` {
mut word := ''
for i < cs.len && cs[i] >= `0` && cs[i] <= `9` {
Expand All @@ -58,7 +52,7 @@ fn tokenize(sql_stmt string) []Token {
continue
}

// strings
// Strings
if cs[i] == `'` {
mut word := ''
i++
Expand All @@ -71,7 +65,7 @@ fn tokenize(sql_stmt string) []Token {
continue
}

// delimited identifiers
// Delimited identifiers
if cs[i] == `"` {
mut word := ''
i++
Expand All @@ -84,7 +78,7 @@ fn tokenize(sql_stmt string) []Token {
continue
}

// operators
// Operators
multi := {
'<>': TokenKind.not_equals_operator
'>=': TokenKind.greater_than_or_equals_operator
Expand Down Expand Up @@ -122,7 +116,7 @@ fn tokenize(sql_stmt string) []Token {
}
}

// keyword or regular identifier
// Keyword or regular identifier
mut word := ''
mut is_not_first := false
for i < cs.len && is_identifier_char(cs[i], is_not_first) {
Expand All @@ -147,7 +141,7 @@ fn tokenize(sql_stmt string) []Token {
}

@[inline]
fn is_identifier_char(c u8, is_not_first bool) bool {
fn is_identifier_char(c rune, is_not_first bool) bool {
yes := (c >= `a` && c <= `z`) || (c >= `A` && c <= `Z`) || c == `_`

if is_not_first {
Expand Down
45 changes: 41 additions & 4 deletions vsql/sql_test.v
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module vsql

import os
import regex
import time

struct SQLTest {
Expand Down Expand Up @@ -56,9 +57,9 @@ fn get_tests() ![]SQLTest {
if contents == 'setup' {
in_setup = true
} else if contents.starts_with('connection ') {
stmts << contents
stmts << replace_unicode(contents)
} else if contents.starts_with('create_catalog ') {
stmts << contents
stmts << replace_unicode(contents)
} else if contents.starts_with('set ') {
parts := contents.split(' ')
if parts[2].starts_with("'") {
Expand All @@ -73,20 +74,21 @@ fn get_tests() ![]SQLTest {
panic('bad directive: "${contents}"')
}
} else if line.starts_with('-- #') {
line_number++
continue
} else if line.starts_with('-- ') {
expected << line[3..]
} else {
if in_setup {
setup_stmt += '\n${line}'
if line.ends_with(';') {
setup << setup_stmt
setup << replace_unicode(setup_stmt)
setup_stmt = ''
}
} else {
stmt += '\n${line}'
if line.ends_with(';') {
stmts << stmt
stmts << replace_unicode(stmt)
stmt = ''
}
}
Expand All @@ -103,6 +105,41 @@ fn get_tests() ![]SQLTest {
return tests
}

// replace_unicode expands `<U+XXXX>` placeholders (exactly four hex digits,
// either case) in a SQL test statement into the actual Unicode character
// they name. This lets tests specify invisible/whitespace characters
// unambiguously. It is a feature of SQL tests only, not of the SQL engine.
fn replace_unicode(s string) string {
	// Callback passed to regex.replace_by_fn; it is invoked once per match
	// of the `<U+XXXX>` pattern and its return value replaces the match.
	// NOTE(review): the code indexes `unicode_point` at start+3..start+6,
	// so `start` is assumed to be the byte offset of the match within the
	// string handed to the callback, and +3 skips the leading "<U+" —
	// confirm against V's regex module docs if this is ever refactored.
	replace_func := fn (re regex.RE, unicode_point string, start int, end int) string {
		// Hex digit -> value lookup table (both upper and lower case).
		hex_chars := {
			`0`: 0
			`1`: 1
			`2`: 2
			`3`: 3
			`4`: 4
			`5`: 5
			`6`: 6
			`7`: 7
			`8`: 8
			`9`: 9
			`a`: 10
			`b`: 11
			`c`: 12
			`d`: 13
			`e`: 14
			`f`: 15
			`A`: 10
			`B`: 11
			`C`: 12
			`D`: 13
			`E`: 14
			`F`: 15
		}
		// Combine the four hex digits into a single code point value:
		// 4096 = 16^3, 256 = 16^2, 16 = 16^1, then render it as a
		// one-character string via rune.str().
		return rune(hex_chars[unicode_point[start + 3]] * 4096 + hex_chars[unicode_point[start +
			4]] * 256 + hex_chars[unicode_point[start + 5]] * 16 + hex_chars[unicode_point[start +
			6]]).str()
	}

	// Matches placeholders of the form `<U+0009>` — exactly four hex digits,
	// so e.g. `<U+9>` or `<U+1F600>` would NOT be replaced.
	mut re := regex.regex_opt(r'<U\+[0-9A-Fa-f]{4}>') or { panic(err) }
	return re.replace_by_fn(s, replace_func)
}

fn test_all() ! {
filter_test, filter_line := get_test_filter()
verbose := $env('VERBOSE')
Expand Down

0 comments on commit 784da6b

Please sign in to comment.