denoland · lionel-rowe · Mar 26, 2024 · Mar 26, 2024 · Mar 27, 2024 · kt3k
@@ -22,6 +22,10 @@
 
 // Global regex whose character class is built by joining the keys of
 // `rawToEntity`, so it matches any one character listed among those keys.
 // NOTE(review): assumes every key is a single character that needs no
 // escaping inside a character class — confirm against `rawToEntity`.
 const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g");
 
+/**
+ * Options accepted by {@linkcode escape}.
+ */
+export type EscapeOptions = {
+  /**
+   * Normalization form applied while escaping. `escape` falls back to
+   * `"readability"` when this is omitted.
+   */
+  form: NormalizationForm;
+};
+const defaultEscapeOptions: EscapeOptions = { form: "readability" };
+
 /**
  * Escapes text for safe interpolation into HTML text content and quoted attributes.
  *
@@ -34,15 +38,23 @@
  * // Characters that don't need to be escaped will be left alone,
  * // even if named HTML entities exist for them.
  * escape("þð"); // "þð"
+ * // You can force non-ASCII chars to be escaped by setting the `form` option to `compatibility`:
+ * escape("þð", { form: "compatibility" }); // "&#xfe;&#xf0;"
  * ```
  */
-export function escape(str: string): string {
-  return str.replaceAll(rawRe, (m) => rawToEntity.get(m)!);
+export function escape(
+  str: string,
+  options?: Partial<EscapeOptions>,
+): string {
+  // Caller-supplied options take precedence over the defaults.
+  const settings = { ...defaultEscapeOptions, ...options };
+  // First swap every character matched by `rawRe` for its mapped entity.
+  const withEntities = str.replaceAll(
+    rawRe,
+    (match) => rawToEntity.get(match)!,
+  );
+  if (settings.form === "compatibility") {
+    // Also hex-escape everything except tab, LF, CR and printable ASCII.
+    return escapeNonAsciiPrintable(withEntities);
+  }
+  // Otherwise only hex-escape characters outside the XML non-restricted set.
+  return escapeXmlRestricted(withEntities);
 }
 
 /**
  * Options accepted by {@linkcode unescape}.
  */
 export type UnescapeOptions = {
   /** Entity list handed to `unescape`; the default is `defaultEntityList`. */
   entityList: EntityList;
 };
-
 const defaultUnescapeOptions: UnescapeOptions = {
   entityList: defaultEntityList,
 };
@@ -73,7 +85,7 @@
  */
 export function unescape(
   str: string,
-  options: Partial<UnescapeOptions> = {},
+  options?: Partial<UnescapeOptions>,
 ): string {
   const { entityList } = { ...defaultUnescapeOptions, ...options };
 
@@ -103,3 +115,73 @@
 
   return codePoint > MAX_CODE_POINT ? "�" : String.fromCodePoint(codePoint);
 }
+
+/**
+ * How aggressively characters are turned into entities when escaping. See
+ * {@linkcode normalize} for examples.
+ *
+ * - `readability`: favors output that is easy for humans to read and small
+ *    in size.
+ * - `compatibility`: favors output that survives boundaries which lack full
+ *    Unicode support, are unaware of encoding, or fail to respect encoding.
+ */
+export type NormalizationForm = "readability" | "compatibility";
+
+/** Options for {@linkcode normalize}. */
+export type NormalizationOptions = {
+  /**
+   * Normalization form used when re-escaping the text. `normalize` falls
+   * back to `"readability"` when this is omitted.
+   */
+  form: NormalizationForm;
+};
+const defaultNormalizationOptions: NormalizationOptions = {
+  form: "readability",
+};
+
+/**
+ * Normalize HTML or XML entities in a string of markup.
+ *
+ * The input is split on runs of `<`, `>`, `'` and `"`. Those runs are kept
+ * verbatim, and every segment between them is unescaped and then re-escaped
+ * using the requested `form`.
+ *
+ * @example
+ * ```ts
+ * import { normalize } from "https://deno.land/std@$STD_VERSION/html/entities.ts";
+ *
+ * normalize("&#x3e;"); // "&gt;"
+ * normalize("&apos;"); // "&#39;"
+ * normalize("&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702;"); // "两只小蜜蜂"
+ *
+ * // other markup is left untouched
+ * normalize("<p class='foo'>&#x1f308;</p>"); // "<p class='foo'>🌈</p>"
+ *
+ * // specifying a `form` option (default is `readability`):
+ * normalize("两只小蜜蜂", { form: "readability" }); // "两只小蜜蜂"
+ * normalize("两只小蜜蜂", { form: "compatibility" }); // "&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702;"
+ * ```
+ *
+ * @param str The markup whose entities should be normalized.
+ * @param options Normalization options; `form` defaults to `readability`.
+ * @returns The markup with its entities rewritten in the chosen form.
+ */
+export function normalize(
+  str: string,
+  options?: Partial<NormalizationOptions>,
+): string {
+  const { form } = { ...defaultNormalizationOptions, ...options };
+  return str
+    // The capture group keeps the delimiter runs in the result, so they land
+    // on the odd indices and are passed through unchanged below.
+    .split(/([<>'"]+)/)
+    .map((segment, i) => i % 2 ? segment : escape(unescape(segment), { form }))
+    .join("");
+}
+
+/**
+ * Hex-escapes every character of `str` that falls outside the set of XML
+ * non-restricted characters.
+ *
+ * See https://en.wikipedia.org/wiki/Valid_characters_in_XML#Non-restricted_characters
+ */
+function escapeXmlRestricted(str: string) {
+  // The negated class lists what is left alone; each run of anything else is
+  // handed to `escapeAllCharsAsHex`.
+  // deno-lint-ignore no-control-regex
+  const restrictedRun = /[^\x09\x0a\x0d\x20-\x7e\x85\xa0-\ud7ff\ue000-\ufdcf\ufdf0-\ufffd\u{10000}-\u{1fffd}\u{20000}-\u{2fffd}\u{30000}-\u{3fffd}\u{40000}-\u{4fffd}\u{50000}-\u{5fffd}\u{60000}-\u{6fffd}\u{70000}-\u{7fffd}\u{80000}-\u{8fffd}\u{90000}-\u{9fffd}\u{a0000}-\u{afffd}\u{b0000}-\u{bfffd}\u{c0000}-\u{cfffd}\u{d0000}-\u{dfffd}\u{e0000}-\u{efffd}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}]+/gu;
+  return str.replaceAll(restrictedRun, (run) => escapeAllCharsAsHex(run));
+}
+
+/**
+ * Hex-escapes every character of `str` other than tab, line feed, carriage
+ * return and the printable ASCII range (U+0020 to U+007E).
+ */
+function escapeNonAsciiPrintable(str: string) {
+  // The negated class lists what is left alone; each run of anything else is
+  // handed to `escapeAllCharsAsHex`.
+  // deno-lint-ignore no-control-regex
+  const nonPrintableRun = /[^\x09\x0a\x0d\x20-\x7e]+/gu;
+  return str.replaceAll(nonPrintableRun, (run) => escapeAllCharsAsHex(run));
+}
+
+/**
+ * Rewrites each code point of `str` as a hexadecimal character reference
+ * of the form `&#x…;`.
+ */
+function escapeAllCharsAsHex(str: string) {
+  let out = "";
+  // `for...of` walks the string by code point, so a surrogate pair yields a
+  // single reference rather than two.
+  for (const char of str) {
+    out += `&#x${char.codePointAt(0)!.toString(16)};`;
+  }
+  return out;
+}
@@ -1,6 +1,6 @@
 // Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.
 
-import { escape, unescape } from "./entities.ts";
+import { escape, normalize, unescape } from "./entities.ts";
 import { assertEquals } from "../assert/mod.ts";
 import entityList from "./named_entity_list.json" with { type: "json" };
 
@@ -20,6 +20,43 @@ Deno.test("escape()", async (t) => {
       assertEquals(escape("þð"), "þð");
     },
   );
+  await t.step(
+    "doesn't escape non-ascii text by default",
+    () => {
+      assertEquals(escape("两只小蜜蜂 🐝🐝"), "两只小蜜蜂 🐝🐝");
+    },
+  );
+  await t.step(
+    "doesn't escape non-ascii text when `form` is `readability`",
+    () => {
+      assertEquals(
+        escape("两只小蜜蜂 🐝🐝", { form: "readability" }),
+        "两只小蜜蜂 🐝🐝",
+      );
+    },
+  );
+  await t.step(
+    "escapes non-ascii text when `form` is `compatibility`",
+    () => {
+      assertEquals(
+        escape("两只小蜜蜂 🐝🐝", { form: "compatibility" }),
+        "&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702; &#x1f41d;&#x1f41d;",
+      );
+      assertEquals(escape("þð", { form: "compatibility" }), "&#xfe;&#xf0;");
+    },
+  );
+  await t.step(
+    "escapes control chars when `form` is `compatibility`",
+    () => {
+      assertEquals(escape("\x03", { form: "compatibility" }), "&#x3;");
+    },
+  );
+  await t.step(
+    "doesn't escape ASCII whitespace chars when `form` is `compatibility`",
+    () => {
+      assertEquals(escape(" \r\n\t", { form: "compatibility" }), " \r\n\t");
+    },
+  );
 });
 
 Deno.test("unescape()", async (t) => {
@@ -108,3 +145,76 @@ Deno.test("unescape()", async (t) => {
     );
   });
 });
+
+Deno.test("normalize()", async (t) => {
+  // Shared fixtures: the same text in literal and hex-escaped spelling.
+  const plain = "两只小蜜蜂";
+  const hexEscaped = "&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702;";
+
+  // Default options first, then each explicit form, in that order.
+  const everyForm: Parameters<typeof normalize>[1][] = [
+    undefined,
+    { form: "readability" },
+    { form: "compatibility" },
+  ];
+  // Asserts that `input` normalizes to `expected` whatever the form.
+  const assertSameInEveryForm = (input: string, expected: string) => {
+    for (const options of everyForm) {
+      assertEquals(normalize(input, options), expected);
+    }
+  };
+
+  await t.step(
+    "normalizes unnecessarily escaped non-ascii chars by default",
+    () => {
+      assertEquals(normalize(hexEscaped), plain);
+      assertEquals(normalize(plain), plain);
+    },
+  );
+  await t.step(
+    "normalizes unnecessarily escaped non-ascii chars if `form` is `readability`",
+    () => {
+      assertEquals(normalize(hexEscaped, { form: "readability" }), plain);
+      assertEquals(normalize(plain, { form: "readability" }), plain);
+    },
+  );
+  await t.step(
+    "normalizes non-ascii chars to escaped form if `form` is `compatibility`",
+    () => {
+      assertEquals(normalize(plain, { form: "compatibility" }), hexEscaped);
+      assertEquals(
+        normalize(hexEscaped, { form: "compatibility" }),
+        hexEscaped,
+      );
+    },
+  );
+  await t.step("leaves markup untouched", () => {
+    const markup = `<tag attr1="dbl" attr2='sgl' />`;
+    assertSameInEveryForm(markup, markup);
+  });
+  await t.step("normalizes unescaped & to &amp;", () => {
+    assertSameInEveryForm("a&b", "a&amp;b");
+  });
+  await t.step("normalizes other forms of entities to a canonical form", () => {
+    assertSameInEveryForm("&#62;&#x3e;&gt;", "&gt;&gt;&gt;");
+  });
+  await t.step(
+    "normalizes &apos; to &#39; (for compliance with HTML 4.01 Strict)",
+    () => {
+      assertSameInEveryForm("&apos;", "&#39;");
+    },
+  );
+});