123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451 |
- 'use strict'
-
- var legacy = require('character-entities-legacy')
- var invalid = require('character-reference-invalid')
- var decimal = require('is-decimal')
- var hexadecimal = require('is-hexadecimal')
- var alphanumerical = require('is-alphanumerical')
- var decodeEntity = require('./decode-entity')
-
- module.exports = parseEntities
-
// Built-ins cached once at module load.
var own = Object.prototype.hasOwnProperty
var fromCharCode = String.fromCharCode
// `Function.prototype` is callable and returns `undefined`, so it serves as a
// no-op stand-in when no warning handler is given.
var noop = Function.prototype

// Default settings.
// `parseEntities` falls back to these for every key that is missing, `null`,
// or `undefined` in the given options.
var defaults = {
  // Handlers, invoked for warnings, decoded references, and plain text.
  warning: null,
  reference: null,
  text: null,
  // `this` values used when invoking the respective handlers.
  warningContext: null,
  referenceContext: null,
  textContext: null,
  // Starting point (or a position with `start` / `indent`) of `value`.
  position: {},
  // Extra character that, directly after `&`, prevents a reference.
  additional: null,
  // Whether `value` is treated as an attribute value.
  attribute: false,
  // Whether references without a closing semicolon are decoded.
  nonTerminated: true
}
-
// Character codes the parser compares against, in ascending order.
var tab = 9 // '\t'
var lineFeed = 10 // '\n'
var formFeed = 12 // '\f'
var space = 32 // ' '
var numberSign = 35 // '#'
var ampersand = 38 // '&'
var semicolon = 59 // ';'
var lessThan = 60 // '<'
var equalsTo = 61 // '='
var uppercaseX = 88 // 'X'
var lowercaseX = 120 // 'x'
var replacementCharacter = 65533 // '�'

// Names of the three kinds of character reference.
var name = 'named'
var hexa = 'hexadecimal'
var deci = 'decimal'

// Radix passed to `parseInt` for each numeric reference type.
var bases = {hexadecimal: 16, decimal: 10}
-
// Character tests, keyed by reference type.
// Every kind of character reference accepts a different set of characters;
// the first character failing its test marks where the reference stops, which
// is needed because the closing semicolon is not strictly required.
var tests = {
  named: alphanumerical,
  decimal: decimal,
  hexadecimal: hexadecimal
}
-
// Warning codes, passed as the third argument to the warning handler.
var namedNotTerminated = 1
var numericNotTerminated = 2
var namedEmpty = 3
var numericEmpty = 4
var namedUnknown = 5
var numericDisallowed = 6
var numericProhibited = 7

// Human-readable warning messages, keyed by warning code.
var messages = {
  // `namedNotTerminated`
  1: 'Named character references must be terminated by a semicolon',
  // `numericNotTerminated`
  2: 'Numeric character references must be terminated by a semicolon',
  // `namedEmpty`
  3: 'Named character references cannot be empty',
  // `numericEmpty`
  4: 'Numeric character references cannot be empty',
  // `namedUnknown`
  5: 'Named character references must be known',
  // `numericDisallowed`
  6: 'Numeric character references cannot be disallowed',
  // `numericProhibited`
  7: 'Numeric character references cannot be outside the permissible Unicode range'
}
-
// Public entry point: normalize `options` into a complete settings object and
// delegate to `parse`.
//
// Every key of `defaults` is copied over; an option that is missing, `null`,
// or `undefined` takes its default.
// When `position` carries `indent` or `start`, it is unpacked: `indent`
// (or an empty list) is stored on the settings and `position` is replaced by
// its `start`.
function parseEntities(value, options) {
  var given = options || {}
  var settings = {}
  var position
  var candidate
  var key

  for (key in defaults) {
    candidate = given[key]

    if (candidate === null || candidate === undefined) {
      candidate = defaults[key]
    }

    settings[key] = candidate
  }

  position = settings.position

  if (position.indent || position.start) {
    settings.indent = position.indent || []
    settings.position = position.start
  }

  return parse(value, settings)
}
-
// Parse entities.
//
// Walks `value` one UTF-16 code unit at a time, decodes every character
// reference that follows an ampersand, and reports text, references, and
// warnings to the optional handlers in `settings`.
//
// Parameters:
// - `value`: string to decode.
// - `settings`: normalized settings object; this function reads `additional`,
//   `nonTerminated`, `attribute`, `position`, `indent`, the `text`,
//   `reference`, and `warning` handlers, and their `*Context` values.
//
// Handlers are invoked as:
// - `text.call(textContext, text, {start, end})`
// - `reference.call(referenceContext, decoded, {start, end}, source)`
// - `warning.call(warningContext, message, point, code)`
//
// Returns the decoded string.
// eslint-disable-next-line complexity
function parse(value, settings) {
  var additional = settings.additional
  var nonTerminated = settings.nonTerminated
  var handleText = settings.text
  var handleReference = settings.reference
  var handleWarning = settings.warning
  var textContext = settings.textContext
  var referenceContext = settings.referenceContext
  var warningContext = settings.warningContext
  var pos = settings.position
  var indent = settings.indent || []
  var length = value.length
  var index = 0
  var lines = -1
  var column = pos.column || 1
  var line = pos.line || 1
  var queue = ''
  var result = []
  var entityCharacters
  var namedEntity
  var terminated
  var characters
  var character
  var reference
  var following
  var warning
  var reason
  var output
  var entity
  var begin
  var start
  var type
  var test
  var prev
  var next
  var diff
  var end

  // `additional` may be given as a character; compare by character code.
  if (typeof additional === 'string') {
    additional = additional.charCodeAt(0)
  }

  // Cache the current point.
  prev = now()

  // Wrap `handleWarning`.
  warning = handleWarning ? parseError : noop

  // Ensure the algorithm walks over the first character and the end
  // (inclusive).
  // Reading one past the end yields `NaN` from `charCodeAt`, which acts as the
  // EOF marker below.
  index--
  length++

  while (++index < length) {
    // If the previous character was a newline.
    if (character === lineFeed) {
      column = indent[lines] || 1
    }

    character = value.charCodeAt(index)

    if (character === ampersand) {
      following = value.charCodeAt(index + 1)

      // The behaviour depends on the identity of the next character.
      if (
        following === tab ||
        following === lineFeed ||
        following === formFeed ||
        following === space ||
        following === ampersand ||
        following === lessThan ||
        // `NaN`: the ampersand is the last character.
        following !== following ||
        (additional && following === additional)
      ) {
        // Not a character reference.
        // No characters are consumed, and nothing is returned.
        // This is not an error, either.
        queue += fromCharCode(character)
        column++

        continue
      }

      start = index + 1
      begin = start
      end = start

      if (following === numberSign) {
        // Numerical entity.
        end = ++begin

        // The behaviour further depends on the next character.
        following = value.charCodeAt(end)

        if (following === uppercaseX || following === lowercaseX) {
          // ASCII hex digits.
          type = hexa
          end = ++begin
        } else {
          // ASCII digits.
          type = deci
        }
      } else {
        // Named entity.
        type = name
      }

      entityCharacters = ''
      entity = ''
      characters = ''
      // `reference` is shared by all iterations, and the branches below that
      // decide nothing can be decoded never assign it.
      // Clear it here, otherwise the previously decoded reference would be
      // emitted again in place of the undecodable one.
      reference = ''
      test = tests[type]
      end--

      while (++end < length) {
        following = value.charCodeAt(end)

        if (!test(following)) {
          break
        }

        characters += fromCharCode(following)

        // Check if we can match a legacy named reference.
        // If so, we cache that as the last viable named reference.
        // This ensures we do not need to walk backwards later.
        if (type === name && own.call(legacy, characters)) {
          entityCharacters = characters
          entity = legacy[characters]
        }
      }

      terminated = value.charCodeAt(end) === semicolon

      if (terminated) {
        end++

        namedEntity = type === name ? decodeEntity(characters) : false

        if (namedEntity) {
          entityCharacters = characters
          entity = namedEntity
        }
      }

      diff = 1 + end - start

      if (!terminated && !nonTerminated) {
        // Non-terminated references are turned off: leave the source as text.
      } else if (!characters) {
        // An empty (possible) entity is valid, unless it’s numeric (thus an
        // ampersand followed by an octothorp).
        if (type !== name) {
          warning(numericEmpty, diff)
        }
      } else if (type === name) {
        // An ampersand followed by anything unknown, and not terminated, is
        // invalid.
        if (terminated && !entity) {
          warning(namedUnknown, 1)
        } else {
          // If there’s something after an entity name which is not known, cap
          // the reference.
          if (entityCharacters !== characters) {
            end = begin + entityCharacters.length
            diff = 1 + end - begin
            terminated = false
          }

          // If the reference is not terminated, warn.
          if (!terminated) {
            reason = entityCharacters ? namedNotTerminated : namedEmpty

            if (settings.attribute) {
              following = value.charCodeAt(end)

              if (following === equalsTo) {
                warning(reason, diff)
                entity = null
              } else if (alphanumerical(following)) {
                entity = null
              } else {
                warning(reason, diff)
              }
            } else {
              warning(reason, diff)
            }
          }
        }

        reference = entity
      } else {
        if (!terminated) {
          // Non-terminated numeric references are still decoded, but trigger
          // a warning.
          warning(numericNotTerminated, diff)
        }

        // Parse the digits as either hexadecimal or decimal.
        reference = parseInt(characters, bases[type])

        // Trigger a warning when the parsed number is prohibited, and replace
        // with replacement character.
        if (prohibited(reference)) {
          warning(numericProhibited, diff)
          reference = fromCharCode(replacementCharacter)
        } else if (reference in invalid) {
          // Trigger a warning when the parsed number is disallowed, and replace
          // by an alternative.
          warning(numericDisallowed, diff)
          reference = invalid[reference]
        } else {
          // Parse the number.
          output = ''

          // Trigger a warning when the parsed number should not be used.
          if (disallowed(reference)) {
            warning(numericDisallowed, diff)
          }

          // Stringify the number.
          // Code points above the BMP are encoded as a surrogate pair: the
          // high ten bits go into the lead surrogate, the low ten bits into
          // the trail surrogate.
          if (reference > 0xffff) {
            reference -= 0x10000
            output += fromCharCode(((reference >>> 10) & 0x3ff) | 0xd800)
            reference = 0xdc00 | (reference & 0x3ff)
          }

          reference = output + fromCharCode(reference)
        }
      }

      // Found it!
      // First eat the queued characters as normal text, then eat an entity.
      if (reference) {
        flush()

        prev = now()
        index = end - 1
        column += end - start + 1
        result.push(reference)
        next = now()
        next.offset++

        if (handleReference) {
          handleReference.call(
            referenceContext,
            reference,
            {start: prev, end: next},
            value.slice(start - 1, end)
          )
        }

        prev = next
      } else {
        // If we could not find a reference, queue the checked characters (as
        // normal characters), and move the pointer to their end.
        // This is possible because we can be certain neither newlines nor
        // ampersands are included.
        characters = value.slice(start - 1, end)
        queue += characters
        column += characters.length
        index = end - 1
      }
    } else {
      // Handle anything other than an ampersand, including newlines and EOF.
      if (character === lineFeed) {
        line++
        lines++
        column = 0
      }

      // `NaN` (never equal to itself) marks EOF: flush instead of queueing.
      if (character === character) {
        queue += fromCharCode(character)
        column++
      } else {
        flush()
      }
    }
  }

  // Return the reduced nodes.
  return result.join('')

  // Get current position.
  function now() {
    return {
      line: line,
      column: column,
      offset: index + (pos.offset || 0)
    }
  }

  // “Throw” a parse-error: a warning.
  // `offset` is added to both the column and the offset of the current point.
  function parseError(code, offset) {
    var position = now()

    position.column += offset
    position.offset += offset

    handleWarning.call(warningContext, messages[code], position, code)
  }

  // Flush `queue` (normal text).
  // Macro invoked before each entity and at the end of `value`.
  // Does nothing when `queue` is empty.
  function flush() {
    if (queue) {
      result.push(queue)

      if (handleText) {
        handleText.call(textContext, queue, {start: prev, end: now()})
      }

      queue = ''
    }
  }
}
-
// Check if `code` is outside the permissible Unicode range: either beyond the
// last code point (U+10FFFF) or inside the surrogate range (U+D800–U+DFFF).
function prohibited(code) {
  if (code > 0x10ffff) {
    return true
  }

  return code >= 0xd800 && code <= 0xdfff
}
-
// Check if `code` is disallowed: it lies in one of the listed inclusive
// ranges, or its low 16 bits are 0xFFFE or 0xFFFF.
function disallowed(code) {
  var ranges = [
    [0x0001, 0x0008],
    [0x000b, 0x000b],
    [0x000d, 0x001f],
    [0x007f, 0x009f],
    [0xfdd0, 0xfdef]
  ]
  var lowBits = code & 0xffff
  var index = -1

  if (lowBits === 0xffff || lowBits === 0xfffe) {
    return true
  }

  while (++index < ranges.length) {
    if (code >= ranges[index][0] && code <= ranges[index][1]) {
      return true
    }
  }

  return false
}
|