Skip to content

Commit

Permalink
feat(html): add normalize function for HTML entities (denoland#4523)
Browse files Browse the repository at this point in the history
  • Loading branch information
lionel-rowe committed Mar 26, 2024
1 parent 1c38d2c commit f726a8d
Show file tree
Hide file tree
Showing 2 changed files with 180 additions and 3 deletions.
71 changes: 69 additions & 2 deletions html/entities.ts
Expand Up @@ -22,6 +22,9 @@ const rawToEntity = new Map<string, string>(rawToEntityEntries);

const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g");

/**
 * Options for {@linkcode escape}.
 *
 * `form` selects the normalization form. When it is `"compatibility"`,
 * {@linkcode escape} additionally rewrites non-ASCII characters and control
 * characters (other than tab, line feed and carriage return) as hexadecimal
 * character references; any other value leaves those characters as they are.
 */
export type EscapeOptions = { form: NormalizationForm };

/**
* Escapes text for safe interpolation into HTML text content and quoted attributes.
*
Expand All @@ -34,10 +37,18 @@ const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g");
* // Characters that don't need to be escaped will be left alone,
* // even if named HTML entities exist for them.
* escape("þð"); // "þð"
* // You can force non-ASCII chars to be escaped by setting the `form` option to `compatibility`:
* escape("þð", { form: "compatibility" }); // "&#xfe;&#xf0;"
* ```
*/
export function escape(str: string): string {
return str.replaceAll(rawRe, (m) => rawToEntity.get(m)!);
export function escape(
  str: string,
  options: Partial<EscapeOptions> = {},
): string {
  // First pass: swap every HTML-significant character for its entity.
  const withEntities = str.replaceAll(
    rawRe,
    (char) => rawToEntity.get(char)!,
  );

  // Only the `compatibility` form goes on to hex-escape the remaining
  // non-ASCII / control characters; every other case stops here.
  if (options.form !== "compatibility") {
    return withEntities;
  }
  return escapeAllNonAsciiPrintable(withEntities);
}

/** Options for {@linkcode unescape}. */
Expand Down Expand Up @@ -103,3 +114,59 @@ function codePointStrToChar(codePointStr: string, radix: number) {

return codePoint > MAX_CODE_POINT ? "�" : String.fromCodePoint(codePoint);
}

/**
 * Normalization form to use for escaping. See {@linkcode normalize} for examples.
 *
 * - `readability`: Optimize for human readability and file size. Non-ASCII
 *   characters are left unescaped.
 * - `compatibility`: Optimize for compatibility across boundaries that lack
 *   full Unicode support, are unaware of encoding, or fail to respect
 *   encoding. Non-ASCII characters are written as hexadecimal character
 *   references (for example `&#xfe;`).
 */
export type NormalizationForm =
| "readability"
| "compatibility";

/**
 * Options for {@linkcode normalize}.
 *
 * `form` is the normalization form handed on to {@linkcode escape} when the
 * text is re-escaped; {@linkcode normalize} uses `readability` when no
 * options are supplied.
 */
export type NormalizationOptions = {
form: NormalizationForm;
};

/**
 * Normalize HTML or XML entities in a string of markup.
 *
 * Runs of `<`, `>`, `'` and `"` are treated as markup delimiters and copied
 * through unchanged. The text between them is unescaped and then escaped
 * again, so that each entity ends up in one canonical spelling.
 *
 * @example
 * ```ts
 * normalize("&#x3e;"); // "&gt;"
 * normalize("&apos;"); // "&#39;"
 * normalize("&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702;"); // "两只小蜜蜂"
 *
 * // specifying a `form` option (default is `readability`):
 * normalize("两只小蜜蜂", { form: "readability" }); // "两只小蜜蜂"
 * normalize("两只小蜜蜂", { form: "compatibility" }); // "&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702;"
 * ```
 *
 * @param str The markup whose entities should be normalized.
 * @param options Normalization options; `form` defaults to `readability`.
 * @returns The markup with the entities in its text rewritten canonically.
 */
export function normalize(
  str: string,
  options: Partial<NormalizationOptions> = { form: "readability" },
): string {
  return str
    .split(/([<>'"]+)/)
    .map((segment, i) => {
      // Splitting on a capturing group keeps the delimiter runs in the
      // result at odd indices; those are markup and must stay untouched.
      return i % 2
        ? segment
        : escape(unescape(segment), { form: options.form });
    })
    .join("");
}

/** Rewrites every code point of `str` as a hexadecimal character reference. */
function escapeAllCharsAsHex(str: string) {
  let encoded = "";
  // Iterating a string with for...of walks code points, so an astral
  // character (surrogate pair) yields one reference rather than two.
  for (const char of str) {
    encoded += `&#x${char.codePointAt(0)!.toString(16)};`;
  }
  return encoded;
}

/**
 * Hex-escapes each run of characters matched by the pattern below: control
 * characters other than tab, line feed and carriage return, plus everything
 * from U+007F upwards. All other characters are left untouched.
 */
function escapeAllNonAsciiPrintable(str: string) {
  // deno-lint-ignore no-control-regex
  const escapableRun = /[\x00-\x08\x0b\x0c\x0e-\x1F\x7F-\u{10ffff}]+/gu;
  // The helper only reads its first argument (the matched run), so it can
  // serve as the replacer directly.
  return str.replaceAll(escapableRun, escapeAllCharsAsHex);
}
112 changes: 111 additions & 1 deletion html/entities_test.ts
@@ -1,6 +1,6 @@
// Copyright 2018-2024 the Deno authors. All rights reserved. MIT license.

import { escape, unescape } from "./entities.ts";
import { escape, normalize, unescape } from "./entities.ts";
import { assertEquals } from "../assert/mod.ts";
import entityList from "./named_entity_list.json" with { type: "json" };

Expand All @@ -20,6 +20,43 @@ Deno.test("escape()", async (t) => {
assertEquals(escape("þð"), "þð");
},
);
await t.step(
"doesn't escape non-ascii text by default",
() => {
assertEquals(escape("两只小蜜蜂 🐝🐝"), "两只小蜜蜂 🐝🐝");
},
);
await t.step(
"doesn't escape non-ascii text when `form` is `readability`",
() => {
assertEquals(
escape("两只小蜜蜂 🐝🐝", { form: "readability" }),
"两只小蜜蜂 🐝🐝",
);
},
);
await t.step(
"escapes non-ascii text when `form` is `compatibility`",
() => {
assertEquals(
escape("两只小蜜蜂 🐝🐝", { form: "compatibility" }),
"&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702; &#x1f41d;&#x1f41d;",
);
assertEquals(escape("þð", { form: "compatibility" }), "&#xfe;&#xf0;");
},
);
await t.step(
"escapes control chars when `form` is `compatibility`",
() => {
assertEquals(escape("\x03", { form: "compatibility" }), "&#x3;");
},
);
await t.step(
"doesn't escape ASCII whitespace chars when `form` is `compatibility`",
() => {
assertEquals(escape(" \r\n\t", { form: "compatibility" }), " \r\n\t");
},
);
});

Deno.test("unescape()", async (t) => {
Expand Down Expand Up @@ -108,3 +145,76 @@ Deno.test("unescape()", async (t) => {
);
});
});

Deno.test("normalize()", async (t) => {
  // Shared fixtures: the same text in raw and hex-escaped spelling.
  const raw = "两只小蜜蜂";
  const hexEscaped = "&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702;";
  const readability = { form: "readability" } as const;
  const compatibility = { form: "compatibility" } as const;

  // `undefined` exercises the default options, followed by each explicit form.
  const everyForm = [undefined, readability, compatibility];

  // Asserts that `input` normalizes to `expected` by default and in both forms.
  const assertForEveryForm = (input: string, expected: string) => {
    for (const options of everyForm) {
      assertEquals(normalize(input, options), expected);
    }
  };

  await t.step(
    "normalizes unnecessarily escaped non-ascii chars by default",
    () => {
      assertEquals(normalize(hexEscaped), raw);
      assertEquals(normalize(raw), raw);
    },
  );

  await t.step(
    "normalizes unnecessarily escaped non-ascii chars if `form` is `readability`",
    () => {
      assertEquals(normalize(hexEscaped, readability), raw);
      assertEquals(normalize(raw, readability), raw);
    },
  );

  await t.step(
    "normalizes non-ascii chars to escaped form if `form` is `compatibility`",
    () => {
      assertEquals(normalize(raw, compatibility), hexEscaped);
      assertEquals(normalize(hexEscaped, compatibility), hexEscaped);
    },
  );

  await t.step("leaves markup untouched", () => {
    const markup = `<tag attr1="dbl" attr2='sgl' />`;
    assertForEveryForm(markup, markup);
  });

  await t.step("normalizes unescaped & to &amp;", () => {
    assertForEveryForm("a&b", "a&amp;b");
  });

  await t.step("normalizes other forms of entities to a canonical form", () => {
    assertForEveryForm("&#62;&#x3e;&gt;", "&gt;&gt;&gt;");
  });

  await t.step(
    "normalizes &apos; to &#39; (for compliance with HTML 4.01 Strict)",
    () => {
      assertForEveryForm("&apos;", "&#39;");
    },
  );
});

0 comments on commit f726a8d

Please sign in to comment.