Skip to content

Commit

Permalink
59: add support for \uXXXX escapes within string literals
Browse files Browse the repository at this point in the history
This is in response to edn-format/edn#65 .

This is an extension: string literals, as currently documented,
do not specify support for \uXXXX escapes.

  https://github.com/edn-format/edn/tree/a51127aecd318096667ae0dafa25353ecb07c9c3

Syntax Notes:

- A Unicode escape must begin with "\u". This is case sensitive: "\U" will
  be rejected.
- "\u" must be followed by exactly four hex digits taken from this set:
  0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F
- The digits are not case sensitive.
- Each such Unicode escape encodes a single 16-bit Java char. Since Java
  uses UTF-16 internally (for historical reasons), code points beyond
  the basic multilingual plane must be written as a pair of unicode
  escapes. (see also "surrogate pairs")

Disabling:

By default \uXXXX escapes are now supported in String literals.

Parser.Config (and Parser.Config.Builder) now support a flag which can
be set to false to disable support for \uXXXX in string literals. This
restores the old behavior of throwing an EdnSyntaxException when such
escapes are encountered.
  • Loading branch information
bpsm committed May 1, 2020
1 parent a489fb1 commit 114ca4e
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 3 deletions.
29 changes: 29 additions & 0 deletions src/main/java/us/bpsm/edn/parser/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,25 @@ public interface Config {
*/
public TagHandler getTagHandler(Tag tag);

/**
 * Reports whether the parser accepts ∖uXXXX escape sequences in string
 * literals.
 * <p>
 * When true, the parser will accept ∖uXXXX escape sequences in string
 * literals and replace them with the corresponding java char in the
 * parsed string. When false, such escape sequences will cause an
 * {@link EdnSyntaxException} to be thrown.
 * <p>
 * The default is true, which is not in strict accordance with the
 * letter of edn-format/README, but:
 * <ul>
 * <li>Clojure's own edn reader behaves in this way.</li>
 * <li>Character literals do allow this syntax according to
 * edn-format/README</li>
 * </ul>
 *
 * @return true if ∖uXXXX escapes in string literals are accepted,
 *         false if they are rejected; this default implementation
 *         always returns true.
 */
public default boolean unicodeEscapesInStringLiteralsAreAccepted() {
return true;
}

/**
* This Builder is used to create a {@link Parser.Config}.
* Fresh Builder instances are provided by
Expand Down Expand Up @@ -326,6 +345,16 @@ public interface Builder {
*/
public Builder putTagHandler(Tag tag, TagHandler handler);

/**
 * Toggle the Parser's willingness to accept unicode escapes
 * in string literals. By default unicode escapes will be
 * accepted.
 *
 * @param acceptUnicodeEscapes true to accept unicode escapes in
 *        string literals, false to have them rejected with an
 *        {@link EdnSyntaxException}.
 * @return a Builder on which further configuration calls can be
 *         chained.
 * @see Config#unicodeEscapesInStringLiteralsAreAccepted()
 */
public Builder acceptUnicodeEscapesInStringLiterals(
boolean acceptUnicodeEscapes
);

/**
* Build and return the {@link Config} described by the
* sequence of calls made on this Builder. Calling
Expand Down
13 changes: 13 additions & 0 deletions src/main/java/us/bpsm/edn/parser/Parsers.java
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,7 @@ public void unread(int ch) throws IOException {
public static Builder newParserConfigBuilder() {
return new Builder() {
boolean used = false;
boolean acceptUnicodeEscapes = true;
CollectionBuilder.Factory listFactory = DEFAULT_LIST_FACTORY;
CollectionBuilder.Factory vectorFactory = DEFAULT_VECTOR_FACTORY;
CollectionBuilder.Factory setFactory = DEFAULT_SET_FACTORY;
Expand Down Expand Up @@ -232,6 +233,13 @@ public Builder putTagHandler(Tag tag, TagHandler handler) {
return this;
}

/**
 * Records whether the Config produced by this Builder should accept
 * unicode escapes in string literals.
 *
 * @param acceptUnicodeEscapes the value the built Config will report
 *        from unicodeEscapesInStringLiteralsAreAccepted().
 * @return this Builder, so calls can be chained.
 */
@Override
public Builder acceptUnicodeEscapesInStringLiterals(boolean acceptUnicodeEscapes) {
// NOTE(review): checkState() is defined outside this view; presumably it
// rejects use of a Builder whose build() has already set 'used' -- confirm.
checkState();
// Stored on the Builder; the anonymous Config created by build() reads it.
this.acceptUnicodeEscapes = acceptUnicodeEscapes;
return this;
}

public Config build() {
checkState();
used = true;
Expand All @@ -255,6 +263,11 @@ public Factory getMapFactory() {
public TagHandler getTagHandler(Tag tag) {
// Plain lookup in the Builder's tagHandlers collection.
// NOTE(review): tagHandlers is declared outside this view; if it is a
// java.util.Map this yields null for an unregistered tag -- confirm.
return tagHandlers.get(tag);
}

/**
 * Overrides the interface default with the value held in the
 * enclosing Builder's acceptUnicodeEscapes field.
 *
 * @return the Builder's acceptUnicodeEscapes setting (initially true).
 */
@Override
public boolean unicodeEscapesInStringLiteralsAreAccepted() {
return acceptUnicodeEscapes;
}
};
}

Expand Down
32 changes: 29 additions & 3 deletions src/main/java/us/bpsm/edn/parser/ScannerImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@
import static us.bpsm.edn.parser.Parser.Config.BIG_INTEGER_TAG;
import static us.bpsm.edn.parser.Parser.Config.DOUBLE_TAG;
import static us.bpsm.edn.parser.Parser.Config.LONG_TAG;
import static us.bpsm.edn.util.CharClassify.isDigit;
import static us.bpsm.edn.util.CharClassify.isWhitespace;
import static us.bpsm.edn.util.CharClassify.separatesTokens;
import static us.bpsm.edn.util.CharClassify.*;

import java.io.IOException;
import java.math.BigDecimal;
Expand All @@ -35,6 +33,7 @@ class ScannerImpl implements Scanner {
private final TagHandler bigDecimalHandler;
private final TagHandler bigIntegerHandler;
private final TagHandler doubleHandler;
private final boolean unicodeEscapesInStringLiteralsAreAccepted;

/**
* Scanner may throw an IOException during construction, in which case
Expand All @@ -50,6 +49,8 @@ class ScannerImpl implements Scanner {
this.bigIntegerHandler = cfg.getTagHandler(BIG_INTEGER_TAG);
this.doubleHandler = cfg.getTagHandler(DOUBLE_TAG);
this.bigDecimalHandler = cfg.getTagHandler(BIG_DECIMAL_TAG);
this.unicodeEscapesInStringLiteralsAreAccepted =
cfg.unicodeEscapesInStringLiteralsAreAccepted();
}

/* (non-Javadoc)
Expand Down Expand Up @@ -377,6 +378,31 @@ private String readStringLiteral(Parseable pbr) throws IOException {
case '\\':
b.append('\\');
break;
case 'u':
    // Backslash-u escape: exactly four ASCII hex digits naming one
    // UTF-16 code unit. Rejected outright when disabled via Parser.Config.
    if (!unicodeEscapesInStringLiteralsAreAccepted) {
        throw new EdnSyntaxException(
                "Unsupported '" + ((char) curr)
                        + "' escape in string. "
                        + "(Unicode escapes disabled by Parser.Config)"
        );
    }
    /*
     2020-05-01 Support for reading unicode escapes within
     string literals is an extension to EDN. It is not part of
     the spec described here: https://github.com/edn-format/edn
     */
    int v = 0;
    for (int i = 0; i < 4; i++) {
        curr = pbr.read();
        // Character.digit(int, 16) also recognises non-ASCII digits
        // (fullwidth forms, Arabic-Indic digits, ...). Only
        // 0-9, a-f and A-F are valid here, so anything outside the
        // ASCII range is treated as a non-digit. End of input
        // (curr == -1) falls into the same rejection path.
        int d = (curr >= 0 && curr < 0x80)
                ? Character.digit(curr, 16)
                : -1;
        if (d == -1) {
            throw new EdnSyntaxException(
                    "Invalid \\u Unicode escape in string.");
        }
        // Accumulate the four digits, most significant first.
        v = v * 16 + d;
    }
    // Four hex digits always fit in 16 bits, so the cast is lossless.
    b.append((char) v);
    break;
default:
throw new EdnSyntaxException("Unsupported '"+ ((char)curr)
+"' escape in string");
Expand Down
30 changes: 30 additions & 0 deletions src/test/java/us/bpsm/edn/parser/ScannerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import org.junit.Test;

import us.bpsm.edn.EdnException;
import us.bpsm.edn.EdnSyntaxException;
import us.bpsm.edn.Keyword;
import us.bpsm.edn.Symbol;
import us.bpsm.edn.parser.Parseable;
Expand Down Expand Up @@ -366,6 +367,35 @@ public void unicodeEscapeCharacterLiterals() {
assertEquals(c, s.nextToken(pbr));
}

/**
 * A string literal made of three unicode escapes (NUL, an arbitrary
 * BMP char, and one using mixed-case hex digits) must scan to a
 * three-char String holding exactly those chars.
 */
@Test
public void unicodeEscapesInStringLiterals() {
    // The backslash is kept apart from the 'u' so the Java source itself
    // never contains a unicode escape at this point.
    final String[] escapeBodies = { "u0000", "u1234", "u0Ff0" };
    final StringBuilder literal = new StringBuilder();
    literal.append("\"");
    for (String body : escapeBodies) {
        literal.append("\\").append(body);
    }
    literal.append("\"");
    final String txt = literal.toString();

    final String expected = "\u0000\u1234\u0Ff0";
    assertEquals(3, expected.length());

    final Parseable pbr = Parsers.newParseable(txt);
    final Scanner s = scanner();
    final Object token = s.nextToken(pbr);
    assertEquals(expected, token);
}

/**
 * Only three hex digits precede the closing quote, so the scanner
 * must reject the escape with an EdnSyntaxException.
 */
@Test(expected = EdnSyntaxException.class)
public void truncatedUnicodeEscapeInStringLiteral() {
    final Scanner s = scanner();
    final Parseable input = Parsers.newParseable("\"\\" + "u123\"");
    s.nextToken(input);
}

/**
 * The input ends after three hex digits (no fourth digit and no
 * closing quote), so the scanner must fail with an
 * EdnSyntaxException.
 */
@Test(expected = EdnSyntaxException.class)
public void truncatedInputInUnicodeEscapeInStringLiteral() {
    final Scanner s = scanner();
    final Parseable input = Parsers.newParseable("\"\\" + "u123");
    s.nextToken(input);
}

/**
 * The fourth position of the escape holds '?', which is not a hex
 * digit, so the scanner must fail with an EdnSyntaxException.
 */
@Test(expected = EdnSyntaxException.class)
public void nonDigitInUnicodeEscapeInStringLiteral() {
    final Scanner s = scanner();
    final Parseable input = Parsers.newParseable("\"\\" + "u123?\"");
    s.nextToken(input);
}

@Test
public void simpleStringWithLinebreak() {
assertEquals("\n", scan("\"\n\""));
Expand Down

0 comments on commit 114ca4e

Please sign in to comment.