You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
svelte/packages/svelte/src/compiler/phases/1-parse/utils/html.js

176 lines
4.6 KiB

import entities from './entities.js';
const windows_1252 = [
8364, 129, 8218, 402, 8222, 8230, 8224, 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, 144, 8216,
8217, 8220, 8221, 8226, 8211, 8212, 732, 8482, 353, 8250, 339, 157, 382, 376
];
/**
* @param {string} entity_name
* @param {boolean} is_attribute_value
*/
function reg_exp_entity(entity_name, is_attribute_value) {
// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
// doesn't decode the html entity which not ends with ; and next character is =, number or alphabet in attribute value.
if (is_attribute_value && !entity_name.endsWith(';')) {
return `${entity_name}\\b(?!=)`;
}
return entity_name;
}
/** @param {boolean} is_attribute_value */
function get_entity_pattern(is_attribute_value) {
const reg_exp_num = '#(?:x[a-fA-F\\d]+|\\d+)(?:;)?';
const reg_exp_entities = Object.keys(entities).map(
/** @param {any} entity_name */ (entity_name) => reg_exp_entity(entity_name, is_attribute_value)
);
const entity_pattern = new RegExp(`&(${reg_exp_num}|${reg_exp_entities.join('|')})`, 'g');
return entity_pattern;
}
const entity_pattern_content = get_entity_pattern(false);
const entity_pattern_attr_value = get_entity_pattern(true);
/**
* @param {string} html
* @param {boolean} is_attribute_value
*/
export function decode_character_references(html, is_attribute_value) {
const entity_pattern = is_attribute_value ? entity_pattern_attr_value : entity_pattern_content;
return html.replace(
entity_pattern,
/**
* @param {any} match
* @param {keyof typeof entities} entity
*/ (match, entity) => {
let code;
// Handle named entities
if (entity[0] !== '#') {
code = entities[entity];
} else if (entity[1] === 'x') {
code = parseInt(entity.substring(2), 16);
} else {
code = parseInt(entity.substring(1), 10);
}
if (!code) {
return match;
}
return String.fromCodePoint(validate_code(code));
}
);
}
const NUL = 0;
// some code points are verboten. If we were inserting HTML, the browser would replace the illegal
// code points with alternatives in some cases - since we're bypassing that mechanism, we need
// to replace them ourselves
//
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
/** @param {number} code */
function validate_code(code) {
// line feed becomes generic whitespace
if (code === 10) {
return 32;
}
// ASCII range. (Why someone would use HTML entities for ASCII characters I don't know, but...)
if (code < 128) {
return code;
}
// code points 128-159 are dealt with leniently by browsers, but they're incorrect. We need
// to correct the mistake or we'll end up with missing € signs and so on
if (code <= 159) {
return windows_1252[code - 128];
}
// basic multilingual plane
if (code < 55296) {
return code;
}
// UTF-16 surrogate halves
if (code <= 57343) {
return NUL;
}
// rest of the basic multilingual plane
if (code <= 65535) {
return code;
}
// supplementary multilingual plane 0x10000 - 0x1ffff
if (code >= 65536 && code <= 131071) {
return code;
}
// supplementary ideographic plane 0x20000 - 0x2ffff
if (code >= 131072 && code <= 196607) {
return code;
}
return NUL;
}
// based on http://developers.whatwg.org/syntax.html#syntax-tag-omission
// while `input` is also an interactive element, it is never moved by the browser, so we don't need to check for it
export const interactive_elements = new Set([
'a',
'button',
'iframe',
'embed',
'select',
'textarea'
]);
/** @type {Record<string, Set<string>>} */
const disallowed_contents = {
li: new Set(['li']),
dt: new Set(['dt', 'dd']),
dd: new Set(['dt', 'dd']),
p: new Set(
'address article aside blockquote div dl fieldset footer form h1 h2 h3 h4 h5 h6 header hgroup hr main menu nav ol p pre section table ul'.split(
' '
)
),
rt: new Set(['rt', 'rp']),
rp: new Set(['rt', 'rp']),
optgroup: new Set(['optgroup']),
option: new Set(['option', 'optgroup']),
thead: new Set(['tbody', 'tfoot']),
tbody: new Set(['tbody', 'tfoot']),
tfoot: new Set(['tbody']),
tr: new Set(['tr', 'tbody']),
td: new Set(['td', 'th', 'tr']),
th: new Set(['td', 'th', 'tr'])
};
for (const interactive_element of interactive_elements) {
disallowed_contents[interactive_element] = interactive_elements;
}
// can this be a child of the parent element, or does it implicitly
// close it, like `<li>one<li>two`?
/**
* @param {string} current
* @param {string} [next]
*/
export function closing_tag_omitted(current, next) {
if (disallowed_contents[current]) {
if (!next || disallowed_contents[current].has(next)) {
return true;
}
}
return false;
}