From e8d7113cbede9df9cf2f453faf410c12ee1274bf Mon Sep 17 00:00:00 2001 From: Edward Cooke Date: Mon, 28 Aug 2023 16:41:25 -0600 Subject: [PATCH 1/2] Support UTF-8 surrogates for UTF-16 and 32. --- YamlDotNet.Benchmark/Program.cs | 11 +---- YamlDotNet.Test/Core/ScannerTests.cs | 16 ++++++- YamlDotNet/Core/Scanner.cs | 63 ++++++++++++++++++++++++---- 3 files changed, 73 insertions(+), 17 deletions(-) diff --git a/YamlDotNet.Benchmark/Program.cs b/YamlDotNet.Benchmark/Program.cs index 8b88132d..9fc2ebd9 100644 --- a/YamlDotNet.Benchmark/Program.cs +++ b/YamlDotNet.Benchmark/Program.cs @@ -1,11 +1,4 @@ -using System.Globalization; -using BenchmarkDotNet.Running; +using BenchmarkDotNet.Running; using YamlDotNet.Benchmark; -using YamlDotNet.Core; -using YamlDotNet.Core.Events; -using YamlDotNet.Serialization; -using YamlDotNet.Serialization.NamingConventions; -var dateTimeOffset = new DateTimeOffset(new DateTime(2017, 1, 2, 3, 4, 5), new TimeSpan(-6, 0, 0)); -Console.WriteLine(dateTimeOffset.ToString("MM/dd/yyyy HH:mm:ss zzz", CultureInfo.InvariantCulture)); -Console.WriteLine(dateTimeOffset.ToString("O", CultureInfo.InvariantCulture)); +BenchmarkSwitcher.FromAssembly(typeof(YamlStreamBenchmark).Assembly).Run(args); diff --git a/YamlDotNet.Test/Core/ScannerTests.cs b/YamlDotNet.Test/Core/ScannerTests.cs index bc1ae17f..6ef9c159 100644 --- a/YamlDotNet.Test/Core/ScannerTests.cs +++ b/YamlDotNet.Test/Core/ScannerTests.cs @@ -1,4 +1,4 @@ -// This file is part of YamlDotNet - A .NET library for YAML. +// This file is part of YamlDotNet - A .NET library for YAML. 
// Copyright (c) Antoine Aubry and contributors // // Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -530,6 +530,20 @@ public void Keys_can_start_with_colons_after_double_quoted_values_in_nested_bloc StreamEnd); } + [Fact] + public void Utf16StringsAsUtf8SurrogatesWorkCorrectly() + { + AssertSequenceOfTokensFrom(Yaml.ScannerForText("Test: \"\\uD83D\\uDC4D\""), + StreamStart, + BlockMappingStart, + Key, + PlainScalar("Test"), + Value, + DoubleQuotedScalar("\uD83D\uDC4D"), // thumbs-up emoji (U+1F44D); spelled with \u escapes because Windows Terminal fails to display the literal character. + BlockEnd, + StreamEnd); + } + private void AssertPartialSequenceOfTokensFrom(Scanner scanner, params Token[] tokens) { var tokenNumber = 1; diff --git a/YamlDotNet/Core/Scanner.cs b/YamlDotNet/Core/Scanner.cs index 6d0138a9..7e743dc0 100644 --- a/YamlDotNet/Core/Scanner.cs +++ b/YamlDotNet/Core/Scanner.cs @@ -1937,19 +1937,68 @@ private Scalar ScanFlowScalar(bool isSingleQuoted) // Check the value and write the character. - if ((character >= 0xD800 && character <= 0xDFFF) || character > 0x10FFFF) + // The escape decoded to a UTF-16 surrogate code unit (0xD800-0xDFFF); a second \u or \U escape must follow to complete the pair. + if (character >= 0xD800 && character <= 0xDFFF) + { + for (var k = 0; k < codeLength; ++k) + { + Skip(); + } + + if (analyzer.Peek(0) == '\\' && + (analyzer.Peek(1) == 'u' || analyzer.Peek(1) == 'U')) + { + Skip(); // skip the backslash that starts the second escape + if (analyzer.Peek(0) == 'u') + { + codeLength = 4; + } + else + { + codeLength = 8; + } + Skip(); // skip the 'u' / 'U' escape marker + + var lowSurrogate = 0; + + // Scan the character value.
+ for (var k = 0; k < codeLength; ++k) + { + if (!analyzer.IsHex(k)) // validate every digit of the escape, not only the first one + { + throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, did not find expected hexadecimal number."); + } + lowSurrogate = ((lowSurrogate << 4) + analyzer.AsHex(k)); + } + + for (var k = 0; k < codeLength; ++k) + { + Skip(); + } + + character = char.ConvertToUtf32((char)character, (char)lowSurrogate); // NOTE(review): throws ArgumentOutOfRangeException unless the pair is high-then-low; consider validating and raising SyntaxErrorException instead + } + else + { + throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode surrogates."); + } + } + else if (character > 0x10FFFF) { throw new SyntaxErrorException(start, cursor.Mark(), "While scanning a quoted scalar, found invalid Unicode character escape code."); } + else + { + // Advance the pointer. - value.Append(char.ConvertFromUtf32(character)); - - // Advance the pointer. + for (var k = 0; k < codeLength; ++k) + { + Skip(); + } - for (var k = 0; k < codeLength; ++k) - { - Skip(); } + + value.Append(char.ConvertFromUtf32(character)); } } else From 53e3c1ba0e1b8cefc12ae355e063648f6a3b3627 Mon Sep 17 00:00:00 2001 From: Edward Cooke Date: Mon, 28 Aug 2023 16:45:56 -0600 Subject: [PATCH 2/2] Added test for UTF16 in Yaml --- YamlDotNet.Test/Core/ScannerTests.cs | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/YamlDotNet.Test/Core/ScannerTests.cs b/YamlDotNet.Test/Core/ScannerTests.cs index 6ef9c159..eb3cd93a 100644 --- a/YamlDotNet.Test/Core/ScannerTests.cs +++ b/YamlDotNet.Test/Core/ScannerTests.cs @@ -1,4 +1,4 @@ -// This file is part of YamlDotNet - A .NET library for YAML. +// This file is part of YamlDotNet - A .NET library for YAML.
// Copyright (c) Antoine Aubry and contributors // // Permission is hereby granted, free of charge, to any person obtaining a copy of @@ -544,6 +544,20 @@ public void Utf16StringsAsUtf8SurrogatesWorkCorrectly() StreamEnd); } + [Fact] + public void Utf16CharactersAreReadCorrectly() + { + AssertSequenceOfTokensFrom(Yaml.ScannerForText("Test: \"\uD83D\uDC4D\""), + StreamStart, + BlockMappingStart, + Key, + PlainScalar("Test"), + Value, + DoubleQuotedScalar("\uD83D\uDC4D"), // thumbs-up emoji (U+1F44D); spelled with \u escapes because Windows Terminal fails to display the literal character. + BlockEnd, + StreamEnd); + } + private void AssertPartialSequenceOfTokensFrom(Scanner scanner, params Token[] tokens) { var tokenNumber = 1;