You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
svelte/src/parse/utils/html.ts

78 lines
2.1 KiB

import htmlEntities from './entities';
// Replacement code points for character references in the range 128-159 (0x80-0x9f),
// indexed by `code - 128`. Positions with no Windows-1252 assignment map to themselves.
const windows1252 = [
	8364, // 0x80 -> U+20AC
	129,  // 0x81 (unassigned)
	8218, // 0x82 -> U+201A
	402,  // 0x83 -> U+0192
	8222, // 0x84 -> U+201E
	8230, // 0x85 -> U+2026
	8224, // 0x86 -> U+2020
	8225, // 0x87 -> U+2021
	710,  // 0x88 -> U+02C6
	8240, // 0x89 -> U+2030
	352,  // 0x8a -> U+0160
	8249, // 0x8b -> U+2039
	338,  // 0x8c -> U+0152
	141,  // 0x8d (unassigned)
	381,  // 0x8e -> U+017D
	143,  // 0x8f (unassigned)
	144,  // 0x90 (unassigned)
	8216, // 0x91 -> U+2018
	8217, // 0x92 -> U+2019
	8220, // 0x93 -> U+201C
	8221, // 0x94 -> U+201D
	8226, // 0x95 -> U+2022
	8211, // 0x96 -> U+2013
	8212, // 0x97 -> U+2014
	732,  // 0x98 -> U+02DC
	8482, // 0x99 -> U+2122
	353,  // 0x9a -> U+0161
	8250, // 0x9b -> U+203A
	339,  // 0x9c -> U+0153
	157,  // 0x9d (unassigned)
	382,  // 0x9e -> U+017E
	376   // 0x9f -> U+0178
];
// Matches a character reference: `&`, then the reference body (captured as group 1), then an
// optional `;`. The body is either a numeric form (`x` + word characters, or decimal digits,
// optionally preceded by `#`) or one of the names in `htmlEntities`.
//
// Entity names are tried longest-first. Regex alternation commits to the first alternative
// that matches, and the trailing `;` is optional, so if the table holds a name that is a
// prefix of a longer name (e.g. `not` and `notin`), listing the shorter one first would make
// it win and leave the tail of the longer reference undecoded.
const entityPattern = new RegExp(
	`&(#?(?:x[\\w\\d]+|\\d+|${Object.keys( htmlEntities ).sort( ( a, b ) => b.length - a.length ).join( '|' )}));?`,
	'g'
);
// Replaces each character reference that `entityPattern` finds in `html` with the character
// it denotes. A reference that does not resolve to a non-zero code point (unknown name,
// unparseable number, or zero) is left in the output exactly as it was written.
export function decodeCharacterReferences ( html: string ) {
	return html.replace( entityPattern, ( reference, body ) => {
		const numeric = body[0] === '#';
		const hex = numeric && body[1] === 'x';

		const codePoint = !numeric
			? htmlEntities[ body ] // named entity: look the code point up in the table
			: parseInt( body.substring( hex ? 2 : 1 ), hex ? 16 : 10 ); // skip `#x` or `#`

		// nothing usable was resolved, so keep the source text untouched
		return codePoint ? String.fromCodePoint( validateCode( codePoint ) ) : reference;
	});
}
const NUL = 0;

// Maps a decoded character reference onto the code point that should actually be emitted.
// If the markup were inserted through a browser, illegal code points would in some cases be
// swapped for alternatives automatically; that mechanism is bypassed here, so the
// substitutions are applied by hand.
//
// Source: http://en.wikipedia.org/wiki/Character_encodings_in_HTML#Illegal_characters
function validateCode ( code: number ) {
	// a line feed is turned into an ordinary space
	if ( code === 0x0a ) return 0x20;

	// plain ASCII passes straight through
	if ( code < 0x80 ) return code;

	// 0x80-0x9f: browsers treat these leniently although they are incorrect; remap them
	// through the windows1252 table so characters such as the euro sign are not lost
	if ( code <= 0x9f ) return windows1252[ code - 0x80 ];

	// basic multilingual plane, below the surrogate range
	if ( code < 0xd800 ) return code;

	// 0xd800-0xdfff: UTF-16 surrogate halves are rejected
	if ( code <= 0xdfff ) return NUL;

	// what remains is accepted only inside one of these ranges; anything else becomes NUL
	const inRestOfBmp = code <= 0xffff;
	const inSupplementaryMultilingual = code >= 0x10000 && code <= 0x1ffff;
	const inSupplementaryIdeographic = code >= 0x20000 && code <= 0x2ffff;

	return ( inRestOfBmp || inSupplementaryMultilingual || inSupplementaryIdeographic ) ? code : NUL;
}