fix(parser): html entities evaluated (#738)
This commit is contained in:
parent
d6a9afb8e1
commit
268186c27d
6 changed files with 41 additions and 88 deletions
|
@ -2,7 +2,7 @@
|
||||||
|
|
||||||
import read_expression from '../read/expression.js';
|
import read_expression from '../read/expression.js';
|
||||||
import read_style from '../read/style.js';
|
import read_style from '../read/style.js';
|
||||||
import { decode_character_references, closing_tag_omitted } from '../utils/html.js';
|
import { closing_tag_omitted } from '../utils/html.js';
|
||||||
import { is_void } from '../../utils/names.js';
|
import { is_void } from '../../utils/names.js';
|
||||||
import { Parser } from '../index.js';
|
import { Parser } from '../index.js';
|
||||||
import { Directive, DirectiveType, TemplateNode, Text } from '../../interfaces.js';
|
import { Directive, DirectiveType, TemplateNode, Text } from '../../interfaces.js';
|
||||||
|
@ -533,7 +533,7 @@ export function read_sequence(parser: Parser, done: () => boolean): TemplateNode
|
||||||
|
|
||||||
function flush() {
|
function flush() {
|
||||||
if (current_chunk.raw) {
|
if (current_chunk.raw) {
|
||||||
current_chunk.data = decode_character_references(current_chunk.raw);
|
current_chunk.data = current_chunk.raw;
|
||||||
current_chunk.end = parser.index;
|
current_chunk.end = parser.index;
|
||||||
chunks.push(current_chunk);
|
chunks.push(current_chunk);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
// @ts-nocheck
|
// @ts-nocheck
|
||||||
|
|
||||||
import { decode_character_references } from '../utils/html.js';
|
|
||||||
import { Parser } from '../index.js';
|
import { Parser } from '../index.js';
|
||||||
|
|
||||||
export default function text(parser: Parser) {
|
export default function text(parser: Parser) {
|
||||||
|
@ -25,7 +24,7 @@ export default function text(parser: Parser) {
|
||||||
end: parser.index,
|
end: parser.index,
|
||||||
type: 'Text',
|
type: 'Text',
|
||||||
raw: data,
|
raw: data,
|
||||||
data: decode_character_references(data),
|
data,
|
||||||
};
|
};
|
||||||
|
|
||||||
parser.current().children.push(node);
|
parser.current().children.push(node);
|
||||||
|
|
|
@ -1,86 +1,3 @@
|
||||||
// @ts-nocheck
|
|
||||||
|
|
||||||
import entities from './entities.js';
|
|
||||||
|
|
||||||
const windows_1252 = [
|
|
||||||
8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376,
|
|
||||||
];
|
|
||||||
|
|
||||||
const entity_pattern = new RegExp(`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, 'g');
|
|
||||||
|
|
||||||
export function decode_character_references(html: string) {
|
|
||||||
return html.replace(entity_pattern, (match, entity) => {
|
|
||||||
let code;
|
|
||||||
|
|
||||||
// Handle named entities
|
|
||||||
if (entity[0] !== '#') {
|
|
||||||
code = entities[entity];
|
|
||||||
} else if (entity[1] === 'x') {
|
|
||||||
code = parseInt(entity.substring(2), 16);
|
|
||||||
} else {
|
|
||||||
code = parseInt(entity.substring(1), 10);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!code) {
|
|
||||||
return match;
|
|
||||||
}
|
|
||||||
|
|
||||||
return String.fromCodePoint(validate_code(code));
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
const NUL = 0;
|
|
||||||
|
|
||||||
// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
|
|
||||||
// code points with alternatives in some cases - since we're bypassing that mechanism, we need
|
|
||||||
// to replace them ourselves
|
|
||||||
//
|
|
||||||
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
|
|
||||||
function validate_code(code: number) {
|
|
||||||
// line feed becomes generic whitespace
|
|
||||||
if (code === 10) {
|
|
||||||
return 32;
|
|
||||||
}
|
|
||||||
|
|
||||||
// ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
|
|
||||||
if (code < 128) {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
|
|
||||||
// code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
|
|
||||||
// to correct the mistake or we'll end up with missing € signs and so on
|
|
||||||
if (code <= 159) {
|
|
||||||
return windows_1252[code - 128];
|
|
||||||
}
|
|
||||||
|
|
||||||
// basic multilingual plane
|
|
||||||
if (code < 55296) {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
|
|
||||||
// UTF-16 surrogate halves
|
|
||||||
if (code <= 57343) {
|
|
||||||
return NUL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// rest of the basic multilingual plane
|
|
||||||
if (code <= 65535) {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
|
|
||||||
// supplementary multilingual plane 0x10000 - 0x1ffff
|
|
||||||
if (code >= 65536 && code <= 131071) {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
|
|
||||||
// supplementary ideographic plane 0x20000 - 0x2ffff
|
|
||||||
if (code >= 131072 && code <= 196607) {
|
|
||||||
return code;
|
|
||||||
}
|
|
||||||
|
|
||||||
return NUL;
|
|
||||||
}
|
|
||||||
|
|
||||||
// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
|
// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
|
||||||
const disallowed_contents = new Map([
|
const disallowed_contents = new Map([
|
||||||
['li', new Set(['li'])],
|
['li', new Set(['li'])],
|
||||||
|
@ -103,7 +20,7 @@ const disallowed_contents = new Map([
|
||||||
// close it, like `<li>one<li>two`?
|
// close it, like `<li>one<li>two`?
|
||||||
export function closing_tag_omitted(current: string, next?: string) {
|
export function closing_tag_omitted(current: string, next?: string) {
|
||||||
if (disallowed_contents.has(current)) {
|
if (disallowed_contents.has(current)) {
|
||||||
if (!next || disallowed_contents.get(current).has(next)) {
|
if (!next || disallowed_contents.get(current)!.has(next)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
3
packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json
vendored
Normal file
3
packages/astro/test/fixtures/html-encoded-characters/snowpack.config.json
vendored
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
{
|
||||||
|
"workspaceRoot": "../../../../../"
|
||||||
|
}
|
11
packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro
vendored
Normal file
11
packages/astro/test/fixtures/html-encoded-characters/src/pages/index.astro
vendored
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
---
|
||||||
|
---
|
||||||
|
<html>
|
||||||
|
<head><title>HTML Encoded Characters</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>&nbsp;&nbsp;&nbsp;Hello, world;</h1>
|
||||||
|
<div>
|
||||||
|
<p>Nested elements?&nbsp;No problem.&nbsp;</p>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
23
packages/astro/test/html-encoded-characters.test.js
Normal file
23
packages/astro/test/html-encoded-characters.test.js
Normal file
|
@ -0,0 +1,23 @@
|
||||||
|
import { suite } from 'uvu';
|
||||||
|
import * as assert from 'uvu/assert';
|
||||||
|
import { doc } from './test-utils.js';
|
||||||
|
import { setup } from './helpers.js';
|
||||||
|
|
||||||
|
const HtmlEncodedChars = suite('HTML Encoded Characters');
|
||||||
|
|
||||||
|
setup(HtmlEncodedChars, './fixtures/html-encoded-characters');
|
||||||
|
|
||||||
|
HtmlEncodedChars("doesn't decode html entities", async ({ runtime }) => {
|
||||||
|
const result = await runtime.load('/');
|
||||||
|
if (result.error) throw new Error(result.error);
|
||||||
|
|
||||||
|
const $ = doc(result.contents);
|
||||||
|
// Note: although this may look like it's incorrectly decoding the chars,
|
||||||
|
// Cheerio is showing how the browsers _should_ interpret the HTML. If it
|
||||||
|
// wasn't working correctly, then the spaces would have been trimmed to a
|
||||||
|
// single space.
|
||||||
|
assert.equal($('h1').html(), '&nbsp;&nbsp;&nbsp;Hello, world;');
|
||||||
|
assert.equal($('div p').html(), 'Nested elements?&nbsp;No problem.&nbsp;');
|
||||||
|
});
|
||||||
|
|
||||||
|
HtmlEncodedChars.run();
|
Loading…
Add table
Reference in a new issue