fix: decode html entities correctly (#8047)

fixes #8026

1. replace the big entities list with one whose entity names end with a ; wherever it is required (there are some exceptions that are also valid without it)
2. construct two regexes from that entities list: one for HTML content, where entity names are matched strictly, and one for attribute values, where it tries to match browser behavior by leaving an entity without a trailing ; undecoded when it is followed by =, a digit, or a letter
3. decode character references with one of these regexes, depending on whether the text is an attribute value or not

---------

Co-authored-by: Yuichiro Yamashita <xydybaseball@gmail.com>
pull/8328/head
xxkl1 2 years ago committed by GitHub
parent aa15a64cbe
commit f34abc5689
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -519,7 +519,7 @@ function read_sequence(parser: Parser, done: () => boolean, location: string): T
function flush(end: number) { function flush(end: number) {
if (current_chunk.raw) { if (current_chunk.raw) {
current_chunk.data = decode_character_references(current_chunk.raw); current_chunk.data = decode_character_references(current_chunk.raw, true);
current_chunk.end = end; current_chunk.end = end;
chunks.push(current_chunk); chunks.push(current_chunk);
} }

@ -19,7 +19,7 @@ export default function text(parser: Parser) {
end: parser.index, end: parser.index,
type: 'Text', type: 'Text',
raw: data, raw: data,
data: decode_character_references(data) data: decode_character_references(data, false)
}; };
parser.current().children.push(node); parser.current().children.push(node);

File diff suppressed because it is too large Load Diff

@ -35,12 +35,32 @@ const windows_1252 = [
376 376
]; ];
/**
 * Builds the regular-expression source fragment used to match one named
 * character reference (without the leading `&`).
 *
 * @param entity_name - entity name as listed in the entities table; it may or
 *   may not end with `;`. Expected to contain no regex metacharacters.
 * @param is_attribute_value - whether the fragment will be used to decode an
 *   attribute value rather than text content.
 * @returns the regex source for this entity. In attribute values, names that
 *   lack a trailing `;` get a negative lookahead appended; every other name is
 *   returned unchanged.
 */
function reg_exp_entity(entity_name: string, is_attribute_value: boolean): string {
	// https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
	// In an attribute value, an entity that does not end with `;` must not be decoded
	// when the next character is `=` or an ASCII alphanumeric.
	if (is_attribute_value && !entity_name.endsWith(';')) {
		// An explicit lookahead is used instead of `\b`, because `\b` also treats `_`
		// as a word character and would wrongly leave e.g. `&copy_x` undecoded.
		return `${entity_name}(?![a-zA-Z0-9=])`;
	}
	return entity_name;
}
function get_entity_pattern(is_attribute_value: boolean) {
const reg_exp_num = '#(?:x[a-fA-F\\d]+|\\d+)(?:;)?';
const reg_exp_entities = Object.keys(entities).map(entity_name => reg_exp_entity(entity_name, is_attribute_value));
const entity_pattern = new RegExp( const entity_pattern = new RegExp(
`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys(entities).join('|')}))(?:;|\\b)`, `&(${reg_exp_num}|${reg_exp_entities.join('|')})`,
'g' 'g'
); );
export function decode_character_references(html: string) { return entity_pattern;
}
const entity_pattern_content = get_entity_pattern(false);
const entity_pattern_attr_value = get_entity_pattern(true);
export function decode_character_references(html: string, is_attribute_value: boolean) {
const entity_pattern = is_attribute_value ? entity_pattern_attr_value : entity_pattern_content;
return html.replace(entity_pattern, (match, entity) => { return html.replace(entity_pattern, (match, entity) => {
let code; let code;

@ -0,0 +1,19 @@
export default {
html: `
<span data-xxx="&amp;copy=value" style="&amp;copy=value"></span>
<span data-xxx="&amp;copy=value" style="&amp;copy=value"></span>
<span data-xxx="©" style="©"></span>
<span data-xxx="©=value" style="©=value"></span>
<span data-xxx="&amp;copyotherstring=value" style="&amp;copyotherstring=value"></span>
<span data-xxx="&amp;copy123=value" style="&amp;copy123=value"></span>
<span data-xxx="&amp;rect=value" style="&amp;rect=value"></span>
<span data-xxx="▭=value" style="▭=value"></span>
`
};

@ -0,0 +1,15 @@
<span data-xxx="&copy=value" style="&copy=value"></span>
<span data-xxx="&copy=value" style="&copy=value"></span>
<span data-xxx="&copy" style="&copy"></span>
<span data-xxx="&copy;=value" style="&copy;=value"></span>
<span data-xxx="&copyotherstring=value" style="&copyotherstring=value"></span>
<span data-xxx="&copy123=value" style="&copy123=value"></span>
<span data-xxx="&rect=value" style="&rect=value"></span>
<span data-xxx="&rect;=value" style="&rect;=value"></span>

@ -4,11 +4,25 @@ export default {
<span>*</span> <span>*</span>
<span>*</span> <span>*</span>
<span>*</span> <span>*</span>
<span>*</span>
<span></span> <span></span>
<span>A</span> <span>A</span>
<span></span> <span></span>
<span></span>
<span>&amp;stringnotanentity;</span>
<span>different &amp;rect and </span>
<span>©otherstring</span>
<span>©=otherstring</span>
<span>©=otherstring</span>
<span>©123</span>
<span>&amp;notanentity;</span> <span>Ÿotherstring</span>
` `
}; };

@ -1,10 +1,24 @@
<span>&ast;</span> <span>&ast;</span>
<span>&midast;</span> <span>&midast;</span>
<span>&#x0002A;</span> <span>&#x0002A;</span>
<span>&#x0002A</span>
<span>&#42;</span> <span>&#42;</span>
<span>&#10;</span> <span>&#10;</span>
<span>&#65;</span> <span>&#65;</span>
<span>&#128;</span> <span>&#128;</span>
<span>&#128</span>
<span>&notanentity;</span> <span>&stringnotanentity;</span>
<span>different &rect and &rect;</span>
<span>&copyotherstring</span>
<span>&copy=otherstring</span>
<span>&copy;=otherstring</span>
<span>&copy123</span>
<span>&#x9fotherstring</span>

Loading…
Cancel
Save