New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(html): add `normalize` function for HTML entities (#4523)
#4524
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,10 @@ | |
|
||
// Global regex whose character class is built by concatenating the keys of
// `rawToEntity`; `escape` uses it to locate the characters it replaces.
// NOTE(review): the keys are joined into `[...]` without regex escaping, so
// this presumably relies on none of them being special inside a character
// class (`]`, `\`, `^`, `-`) — confirm against the `rawToEntity` definition.
const rawRe = new RegExp(`[${[...rawToEntity.keys()].join("")}]`, "g"); | ||
|
||
/** Options for {@linkcode escape}. */ | ||
export type EscapeOptions = { form: NormalizationForm }; | ||
// Fallback values; `escape` spreads caller-supplied options over these.
const defaultEscapeOptions: EscapeOptions = { form: "readability" }; | ||
|
||
/** | ||
* Escapes text for safe interpolation into HTML text content and quoted attributes. | ||
* | ||
|
@@ -34,15 +38,23 @@ | |
* // Characters that don't need to be escaped will be left alone, | ||
* // even if named HTML entities exist for them. | ||
* escape("þð"); // "þð" | ||
* // You can force non-ASCII chars to be escaped by setting the `form` option to `compatibility`: | ||
* escape("þð", { form: "compatibility" }); // "&#xfe;&#xf0;" | ||
* ``` | ||
*/ | ||
export function escape(str: string): string { | ||
return str.replaceAll(rawRe, (m) => rawToEntity.get(m)!); | ||
export function escape( | ||
str: string, | ||
options?: Partial<EscapeOptions>, | ||
): string { | ||
const { form } = { ...defaultEscapeOptions, ...options }; | ||
const escaped = str.replaceAll(rawRe, (m) => rawToEntity.get(m)!); | ||
return form === "compatibility" | ||
? escapeNonAsciiPrintable(escaped) | ||
: escapeXmlRestricted(escaped); | ||
} | ||
|
||
/** Options for {@linkcode unescape}. */ | ||
export type UnescapeOptions = { entityList: EntityList }; | ||
|
||
// Fallback values; `unescape` spreads caller-supplied options over these.
const defaultUnescapeOptions: UnescapeOptions = { | ||
entityList: defaultEntityList, | ||
}; | ||
|
@@ -73,7 +85,7 @@ | |
*/ | ||
export function unescape( | ||
str: string, | ||
options: Partial<UnescapeOptions> = {}, | ||
options?: Partial<UnescapeOptions>, | ||
): string { | ||
const { entityList } = { ...defaultUnescapeOptions, ...options }; | ||
|
||
|
@@ -103,3 +115,73 @@ | |
|
||
return codePoint > MAX_CODE_POINT ? "�" : String.fromCodePoint(codePoint); | ||
} | ||
|
||
/** | ||
* Normalization form to use for escaping. See {@linkcode normalize} for examples. | ||
* | ||
* - `readability`: Optimize for human readability and file size. | ||
* - `compatibility`: Optimize for compatibility across boundaries that lack | ||
* full Unicode support, are unaware of encoding, or fail to respect | ||
* encoding. | ||
*/ | ||
export type NormalizationForm = | ||
| "readability" | ||
| "compatibility"; | ||
|
||
/** Options for {@linkcode normalize}. */
export type NormalizationOptions = { form: NormalizationForm }; | ||
// Fallback values; `normalize` spreads caller-supplied options over these.
const defaultNormalizationOptions: NormalizationOptions = { | ||
form: "readability", | ||
}; | ||
|
||
/** | ||
* Normalize HTML or XML entities in a string of markup. | ||
* | ||
* @example | ||
* ```ts | ||
* import { normalize } from "https://deno.land/std@$STD_VERSION/html/entities.ts"; | ||
* | ||
* normalize(">"); // ">" | ||
* normalize("'"); // "'" | ||
* normalize("两只小蜜蜂"); // "两只小蜜蜂" | ||
* | ||
* // other markup is left untouched | ||
* normalize("<p class='foo'>🌈</p>") // "<p class='foo'>🌈</p>" | ||
* | ||
* // specifying a `form` option (default is `readability`): | ||
* normalize("两只小蜜蜂", { form: "readability" }); // "两只小蜜蜂" | ||
* normalize("两只小蜜蜂", { form: "compatibility" }); // "&#x4e24;&#x53ea;&#x5c0f;&#x871c;&#x8702;" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
|
||
* ``` | ||
*/ | ||
export function normalize(
  str: string,
  options?: Partial<NormalizationOptions>,
): string {
  // Caller-supplied options take precedence over the defaults.
  const { form } = { ...defaultNormalizationOptions, ...options };
  return str
    // The capturing group makes `split` keep the delimiter runs, so the
    // resulting array alternates: even indices hold the text between
    // delimiters, odd indices hold runs of `<`, `>`, `'` and `"`.
    .split(/([<>'"]+)/)
    // Delimiter runs pass through untouched; every other segment is decoded
    // and then re-encoded in the requested form.
    .map((segment, i) =>
      i % 2 ? segment : escape(unescape(segment), { form })
    )
    .join("");
}
|
||
/** | ||
* See https://en.wikipedia.org/wiki/Valid_characters_in_XML#Non-restricted_characters | ||
*/ | ||
function escapeXmlRestricted(str: string) { | ||
return str.replaceAll( | ||
// deno-lint-ignore no-control-regex | ||
/[^\x09\x0a\x0d\x20-\x7e\x85\xa0-\ud7ff\ue000-\ufdcf\ufdf0-\ufffd\u{10000}-\u{1fffd}\u{20000}-\u{2fffd}\u{30000}-\u{3fffd}\u{40000}-\u{4fffd}\u{50000}-\u{5fffd}\u{60000}-\u{6fffd}\u{70000}-\u{7fffd}\u{80000}-\u{8fffd}\u{90000}-\u{9fffd}\u{a0000}-\u{afffd}\u{b0000}-\u{bfffd}\u{c0000}-\u{cfffd}\u{d0000}-\u{dfffd}\u{e0000}-\u{efffd}\u{f0000}-\u{ffffd}\u{100000}-\u{10fffd}]+/gu, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we start escaping these chars by default? Also, does HTML have the same restricted-chars concept as XML?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Upon reflection, this PR needs some more thought. I'd initially thought a sort of "baseline-compatible with both HTML and XML" would be a sensible default, given that these characters are rare in practice yet could cause problems in XML. But it turns out that entities for certain C1 control-character codepoints have different semantics in HTML than in XML — for example:
['application/xml', 'text/html'].map((contentType) => {
const { literal, entity } = JSON.parse(
new DOMParser().parseFromString(
'<div>{"literal": "\x80", "entity": "&#x80;"}</div>',
contentType,
).querySelector('div').textContent,
)
return { contentType, literal, entity }
})
// { "contentType": "application/xml", "literal": "\x80", "entity": "\x80" }
// { "contentType": "text/html", "literal": "\x80", "entity": "€" } Also, I'm not sure converting those characters to entities is the right approach. Probably simply stripping them out would be more sensible in most cases. |
||
(m) => escapeAllCharsAsHex(m), | ||
); | ||
} | ||
|
||
/**
 * Rewrites every run of characters other than tab, line feed, carriage
 * return, and the range U+0020–U+007E by passing the run to
 * `escapeAllCharsAsHex`.
 */
function escapeNonAsciiPrintable(str: string) {
  // deno-lint-ignore no-control-regex
  const outsidePrintableAscii = /[^\x09\x0a\x0d\x20-\x7e]+/gu;
  return str.replaceAll(
    outsidePrintableAscii,
    (run) => escapeAllCharsAsHex(run),
  );
}
|
||
/**
 * Converts every code point of `str` into a hexadecimal numeric character
 * reference of the form `&#x<hex>;` (lowercase hex digits, no padding).
 *
 * @param str Text whose characters should all be replaced.
 * @returns The concatenated character references; `""` for an empty string.
 */
function escapeAllCharsAsHex(str: string): string {
  // Spreading a string iterates by code point, so astral characters stay
  // intact and each `c` is non-empty; `codePointAt(0)` is therefore always
  // defined, which is what justifies the non-null assertion.
  return [...str].map((c) => `&#x${c.codePointAt(0)!.toString(16)};`).join("");
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`form: "compatibility"` sounds confusing to me, as I don't see what it's compatible with.