diff --git a/packages/astro-parser/src/parse/state/tag.ts b/packages/astro-parser/src/parse/state/tag.ts
index f3d30b06d..70fa9e361 100644
--- a/packages/astro-parser/src/parse/state/tag.ts
+++ b/packages/astro-parser/src/parse/state/tag.ts
@@ -2,7 +2,7 @@
 
 import read_expression from '../read/expression.js';
 import read_style from '../read/style.js';
-import { closing_tag_omitted } from '../utils/html.js';
+import { decode_character_references, closing_tag_omitted } from '../utils/html.js';
 import { is_void } from '../../utils/names.js';
 import { Parser } from '../index.js';
 import { Directive, DirectiveType, TemplateNode, Text } from '../../interfaces.js';
@@ -533,7 +533,7 @@ export function read_sequence(parser: Parser, done: () => boolean): TemplateNode
 
   function flush() {
     if (current_chunk.raw) {
-      current_chunk.data = current_chunk.raw;
+      current_chunk.data = decode_character_references(current_chunk.raw);
       current_chunk.end = parser.index;
       chunks.push(current_chunk);
     }
diff --git a/packages/astro-parser/src/parse/state/text.ts b/packages/astro-parser/src/parse/state/text.ts
index dec284ae4..020d066fd 100644
--- a/packages/astro-parser/src/parse/state/text.ts
+++ b/packages/astro-parser/src/parse/state/text.ts
@@ -1,5 +1,6 @@
 // @ts-nocheck
 
+import { decode_character_references } from '../utils/html.js';
 import { Parser } from '../index.js';
 
 export default function text(parser: Parser) {
@@ -24,7 +25,7 @@ export default function text(parser: Parser) {
     end: parser.index,
     type: 'Text',
     raw: data,
-    data,
+    data: decode_character_references(data),
   };
 
   parser.current().children.push(node);
diff --git a/packages/astro-parser/src/parse/utils/html.ts b/packages/astro-parser/src/parse/utils/html.ts
index 9988174f3..e4669a2db 100644
--- a/packages/astro-parser/src/parse/utils/html.ts
+++ b/packages/astro-parser/src/parse/utils/html.ts
@@ -1,3 +1,81 @@
+// @ts-nocheck
+
+import entities from './entities.js';
+
+// Windows-1252 code points for the C1 range 128-159, indexed by (code - 128).
+const windows_1252 = [
+  8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376,
+];
+
+// Matches named (`&amp;`), decimal (`&#38;`) and hexadecimal (`&#x26;` / `&#X26;`) references.
+// The trailing semicolon is optional as long as the reference ends at a word boundary.
+const entity_pattern = new RegExp(`&(#(?:[xX][0-9a-fA-F]+|\\d+)|${Object.keys(entities).join('|')})(?:;|\\b)`, 'g');
+
+/**
+ * Replaces HTML character references in `html` with the characters they stand for.
+ * References that cannot be resolved are left exactly as written.
+ */
+export function decode_character_references(html: string) {
+  return html.replace(entity_pattern, (match, entity) => {
+    let code;
+
+    if (entity[0] !== '#') {
+      // named reference: look the name up in the entity table
+      code = entities[entity];
+    } else if (entity[1] === 'x' || entity[1] === 'X') {
+      code = parseInt(entity.substring(2), 16);
+    } else {
+      code = parseInt(entity.substring(1), 10);
+    }
+
+    // unknown name or a zero code point: keep the source text untouched
+    if (!code) {
+      return match;
+    }
+
+    return String.fromCodePoint(validate_code(code));
+  });
+}
+
+// U+FFFD REPLACEMENT CHARACTER stands in for code points that cannot be emitted
+const REPLACEMENT_CHARACTER = 0xfffd;
+
+// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
+// code points with alternatives in some cases - since we're bypassing that mechanism, we need
+// to replace them ourselves
+//
+// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
+function validate_code(code: number) {
+  // line feed becomes generic whitespace
+  if (code === 10) {
+    return 32;
+  }
+
+  // ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
+  if (code < 128) {
+    return code;
+  }
+
+  // code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
+  // to correct the mistake or we'll end up with missing € signs and so on
+  if (code <= 159) {
+    return windows_1252[code - 128];
+  }
+
+  // UTF-16 surrogate halves (0xD800 - 0xDFFF) are not characters on their own
+  if (code >= 0xd800 && code <= 0xdfff) {
+    return REPLACEMENT_CHARACTER;
+  }
+
+  // anything above 0x10FFFF is outside Unicode; String.fromCodePoint would throw a RangeError
+  if (code > 0x10ffff) {
+    return REPLACEMENT_CHARACTER;
+  }
+
+  // every other code point, in any of the 17 planes, is passed through unchanged
+  return code;
+}
+
 // based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
 const disallowed_contents = new Map([
   ['li', new Set(['li'])],
@@ -20,7 +98,7 @@ const disallowed_contents = new Map([
 // close it, like `
Nested elements? No problem.
-