// While micromark is a lexer/tokenizer, the common case of going from markdown // to html is currently built in as this module, even though the parts can be // used separately to build ASTs, CSTs, or many other output formats. // // Having an HTML compiler built in is useful because it allows us to check for // compliancy to CommonMark, the de facto norm of markdown, specified in roughly // 600 input/output cases. // // This module has an interface which accepts lists of events instead of the // whole at once, however, because markdown can’t be truly streaming, we buffer // events before processing and outputting the final result. export default compileHtml import decodeEntity from 'parse-entities/decode-entity.js' import codes from '../character/codes.mjs' import assign from '../constant/assign.mjs' import constants from '../constant/constants.mjs' import own from '../constant/has-own-property.mjs' import types from '../constant/types.mjs' import combineHtmlExtensions from '../util/combine-html-extensions.mjs' import chunkedPush from '../util/chunked-push.mjs' import miniflat from '../util/miniflat.mjs' import normalizeIdentifier from '../util/normalize-identifier.mjs' import normalizeUri from '../util/normalize-uri.mjs' import safeFromInt from '../util/safe-from-int.mjs' // This ensures that certain characters which have special meaning in HTML are // dealt with. // Technically, we can skip `>` and `"` in many cases, but CM includes them. var characterReferences = {'"': 'quot', '&': 'amp', '<': 'lt', '>': 'gt'} // These two are allowlists of essentially safe protocols for full URLs in // respectively the `href` (on ``) and `src` (on ``) attributes. // They are based on what is allowed on GitHub, // var protocolHref = /^(https?|ircs?|mailto|xmpp)$/i var protocolSrc = /^https?$/i function compileHtml(options) { // Configuration. 
// Includes `htmlExtensions` (an array of extensions), `defaultLineEnding` (a // preferred EOL), `allowDangerousProtocol` (whether to allow potential // dangerous protocols), and `allowDangerousHtml` (whether to allow potential // dangerous HTML). var settings = options || {} // Tags is needed because according to markdown, links and emphasis and // whatnot can exist in images, however, as HTML doesn’t allow content in // images, the tags are ignored in the `alt` attribute, but the content // remains. var tags = true // An object to track identifiers to media (URLs and titles) defined with // definitions. var definitions = {} // A lot of the handlers need to capture some of the output data, modify it // somehow, and then deal with it. // We do that by tracking a stack of buffers, that can be opened (with // `buffer`) and closed (with `resume`) to access them. var buffers = [[]] // As we can have links in images and the other way around, where the deepest // ones are closed first, we need to track which one we’re in. var mediaStack = [] // Same for tightness, which is specific to lists. // We need to track if we’re currently in a tight or loose container. 
var tightStack = [] var defaultHandlers = { enter: { blockQuote: onenterblockquote, codeFenced: onentercodefenced, codeFencedFenceInfo: buffer, codeFencedFenceMeta: buffer, codeIndented: onentercodeindented, codeText: onentercodetext, content: onentercontent, definition: onenterdefinition, definitionDestinationString: onenterdefinitiondestinationstring, definitionLabelString: buffer, definitionTitleString: buffer, emphasis: onenteremphasis, htmlFlow: onenterhtmlflow, htmlText: onenterhtml, image: onenterimage, label: buffer, link: onenterlink, listItemMarker: onenterlistitemmarker, listItemValue: onenterlistitemvalue, listOrdered: onenterlistordered, listUnordered: onenterlistunordered, paragraph: onenterparagraph, reference: buffer, resource: onenterresource, resourceDestinationString: onenterresourcedestinationstring, resourceTitleString: buffer, setextHeading: onentersetextheading, strong: onenterstrong }, exit: { atxHeading: onexitatxheading, atxHeadingSequence: onexitatxheadingsequence, autolinkEmail: onexitautolinkemail, autolinkProtocol: onexitautolinkprotocol, blockQuote: onexitblockquote, characterEscapeValue: onexitdata, characterReferenceMarkerHexadecimal: onexitcharacterreferencemarker, characterReferenceMarkerNumeric: onexitcharacterreferencemarker, characterReferenceValue: onexitcharacterreferencevalue, codeFenced: onexitflowcode, codeFencedFence: onexitcodefencedfence, codeFencedFenceInfo: onexitcodefencedfenceinfo, codeFencedFenceMeta: resume, codeFlowValue: onexitcodeflowvalue, codeIndented: onexitflowcode, codeText: onexitcodetext, codeTextData: onexitdata, data: onexitdata, definition: onexitdefinition, definitionDestinationString: onexitdefinitiondestinationstring, definitionLabelString: onexitdefinitionlabelstring, definitionTitleString: onexitdefinitiontitlestring, emphasis: onexitemphasis, hardBreakEscape: onexithardbreak, hardBreakTrailing: onexithardbreak, htmlFlow: onexithtml, htmlFlowData: onexitdata, htmlText: onexithtml, htmlTextData: 
onexitdata, image: onexitmedia, label: onexitlabel, labelText: onexitlabeltext, lineEnding: onexitlineending, link: onexitmedia, listOrdered: onexitlistordered, listUnordered: onexitlistunordered, paragraph: onexitparagraph, reference: resume, referenceString: onexitreferencestring, resource: resume, resourceDestinationString: onexitresourcedestinationstring, resourceTitleString: onexitresourcetitlestring, setextHeading: onexitsetextheading, setextHeadingLineSequence: onexitsetextheadinglinesequence, setextHeadingText: onexitsetextheadingtext, strong: onexitstrong, thematicBreak: onexitthematicbreak } } // Combine the HTML extensions with the default handlers. // An HTML extension is an object whose fields are either `enter` or `exit` // (reflecting whether a token is entered or exited). // The values at such objects are names of tokens mapping to handlers. // Handlers are called, respectively when a token is opener or closed, with // that token, and a context as `this`. var handlers = combineHtmlExtensions( [defaultHandlers].concat(miniflat(settings.htmlExtensions)) ) // Handlers do often need to keep track of some state. // That state is provided here as a key-value store (an object). var data = {tightStack: tightStack} // The context for handlers references a couple of useful functions. // In handlers from extensions, those can be accessed at `this`. // For the handlers here, they can be accessed directly. var context = { lineEndingIfNeeded: lineEndingIfNeeded, options: settings, encode: encode, raw: raw, tag: tag, buffer: buffer, resume: resume, setData: setData, getData: getData } // Generally, micromark copies line endings (`'\r'`, `'\n'`, `'\r\n'`) in the // markdown document over to the compiled HTML. // In some cases, such as `> a`, CommonMark requires that extra line endings // are added: `

\n
a
\n

`. // This variable hold the default line ending when given (or `undefined`), // and in the latter case will be updated to the first found line ending if // there is one. var lineEndingStyle = settings.defaultLineEnding // Return the function that handles a slice of events. return compile // Deal w/ a slice of events. // Return either the empty string if there’s nothing of note to return, or the // result when done. function compile(events) { // As definitions can come after references, we need to figure out the media // (urls and titles) defined by them before handling the references. // So, we do sort of what HTML does: put metadata at the start (in head), and // then put content after (`body`). var head = [] var body = [] var index var start var listStack var handler var result index = -1 start = 0 listStack = [] while (++index < events.length) { // Figure out the line ending style used in the document. if ( !lineEndingStyle && (events[index][1].type === types.lineEnding || events[index][1].type === types.lineEndingBlank) ) { lineEndingStyle = events[index][2].sliceSerialize(events[index][1]) } // Preprocess lists to infer whether the list is loose or not. if ( events[index][1].type === types.listOrdered || events[index][1].type === types.listUnordered ) { if (events[index][0] === 'enter') { listStack.push(index) } else { prepareList(events.slice(listStack.pop(), index)) } } // Move definitions to the front. if (events[index][1].type === types.definition) { if (events[index][0] === 'enter') { body = chunkedPush(body, events.slice(start, index)) start = index } else { head = chunkedPush(head, events.slice(start, index + 1)) start = index + 1 } } } head = chunkedPush(head, body) head = chunkedPush(head, events.slice(start)) result = head index = -1 // Handle the start of the document, if defined. if (handlers.enter.null) { handlers.enter.null.call(context) } // Handle all events. 
while (++index < events.length) { handler = handlers[result[index][0]] if (own.call(handler, result[index][1].type)) { handler[result[index][1].type].call( assign({sliceSerialize: result[index][2].sliceSerialize}, context), result[index][1] ) } } // Handle the end of the document, if defined. if (handlers.exit.null) { handlers.exit.null.call(context) } return buffers[0].join('') } // Figure out whether lists are loose or not. function prepareList(slice) { var length = slice.length - 1 // Skip close. var index = 0 // Skip open. var containerBalance = 0 var loose var atMarker var event while (++index < length) { event = slice[index] if (event[1]._container) { atMarker = undefined if (event[0] === 'enter') { containerBalance++ } else { containerBalance-- } } else if (event[1].type === types.listItemPrefix) { if (event[0] === 'exit') { atMarker = true } } else if (event[1].type === types.linePrefix) { // Ignore } else if (event[1].type === types.lineEndingBlank) { if (event[0] === 'enter' && !containerBalance) { if (atMarker) { atMarker = undefined } else { loose = true } } } else { atMarker = undefined } } slice[0][1]._loose = loose } // Set data into the key-value store. function setData(key, value) { data[key] = value } // Get data from the key-value store. function getData(key) { return data[key] } // Capture some of the output data. function buffer() { buffers.push([]) } // Stop capturing and access the output data. function resume() { return buffers.pop().join('') } // Output (parts of) HTML tags. function tag(value) { if (!tags) return setData('lastWasTag', true) buffers[buffers.length - 1].push(value) } // Output raw data. function raw(value) { setData('lastWasTag') buffers[buffers.length - 1].push(value) } // Output an extra line ending. function lineEnding() { raw(lineEndingStyle || '\n') } // Output an extra line ending if the previous value wasn’t EOF/EOL. 
// NOTE: this span is still inside `compileHtml`.
// NOTE(review): the HTML literals, and the functions whose headers were lost
// when markup was stripped from this file (`onenterlistunordered`,
// `onenterlistitemvalue`, `onenterlistitemmarker`, `onexitcodefencedfenceinfo`,
// parts of `onenterlistordered`, `onentercodefenced`, and
// `onexitcodefencedfence`), are reconstructed from the handler table and the
// surviving fragments; confirm against the upstream file.

// Output an extra line ending if the previous value wasn’t EOF/EOL.
function lineEndingIfNeeded() {
  var buffer = buffers[buffers.length - 1]
  var slice = buffer[buffer.length - 1]
  var previous = slice ? slice.charCodeAt(slice.length - 1) : codes.eof

  if (
    previous === codes.lf ||
    previous === codes.cr ||
    previous === codes.eof
  ) {
    return
  }

  lineEnding()
}

// Make a value safe for injection in HTML (except w/ `ignoreEncode`).
function encode(value) {
  return getData('ignoreEncode') ? value : value.replace(/["&<>]/g, replace)

  function replace(value) {
    return '&' + characterReferences[value] + ';'
  }
}

// Make a value safe for injection as a URL.
// This does encode unsafe characters with percent-encoding, skipping already
// encoded sequences (`normalizeUri`).
// Further unsafe characters are encoded as character references (`encode`).
// Finally, if the URL includes an unknown protocol (such as a dangerous
// example, `javascript:`), the value is ignored.
function url(url, protocol) {
  var value = encode(normalizeUri(url || ''))
  var colon = value.indexOf(':')
  var questionMark = value.indexOf('?')
  var numberSign = value.indexOf('#')
  var slash = value.indexOf('/')

  if (
    settings.allowDangerousProtocol ||
    // If there is no protocol, it’s relative.
    colon < 0 ||
    // If the first colon is after a `?`, `#`, or `/`, it’s not a protocol.
    (slash > -1 && colon > slash) ||
    (questionMark > -1 && colon > questionMark) ||
    (numberSign > -1 && colon > numberSign) ||
    // It is a protocol, it should be allowed.
    protocol.test(value.slice(0, colon))
  ) {
    return value
  }

  return ''
}

//
// Handlers.
//

// Open an ordered list; the tag is left open so a `start` attribute can follow.
function onenterlistordered(token) {
  tightStack.push(!token._loose)
  lineEndingIfNeeded()
  tag('<ol')
  setData('expectFirstItem', true)
}

// Open an unordered list; closed by the first item marker.
function onenterlistunordered(token) {
  tightStack.push(!token._loose)
  lineEndingIfNeeded()
  tag('<ul')
  setData('expectFirstItem', true)
}

// Add a `start` attribute when the first item of an ordered list isn’t `1`.
function onenterlistitemvalue(token) {
  var value

  if (getData('expectFirstItem')) {
    value = parseInt(this.sliceSerialize(token), constants.numericBaseDecimal)

    if (value !== 1) {
      tag(' start="' + encode(String(value)) + '"')
    }
  }
}

// Start a list item: finish the list’s opening tag for the first item, or
// close the previous item otherwise.
function onenterlistitemmarker() {
  if (getData('expectFirstItem')) {
    tag('>')
  } else {
    onexitlistitem()
  }

  lineEndingIfNeeded()
  tag('<li>')
  setData('expectFirstItem')
  // “Hack” to prevent a line ending from showing up if the item is empty.
  setData('lastWasTag')
}

function onexitlistordered() {
  onexitlistitem()
  tightStack.pop()
  lineEnding()
  tag('</ol>')
}

function onexitlistunordered() {
  onexitlistitem()
  tightStack.pop()
  lineEnding()
  tag('</ul>')
}

function onexitlistitem() {
  if (getData('lastWasTag') && !getData('slurpAllLineEndings')) {
    lineEndingIfNeeded()
  }

  tag('</li>')
  setData('slurpAllLineEndings')
}

function onenterblockquote() {
  // Block quotes are never tight.
  tightStack.push(false)
  lineEndingIfNeeded()
  tag('<blockquote>')
}

function onexitblockquote() {
  tightStack.pop()
  lineEndingIfNeeded()
  tag('</blockquote>')
  setData('slurpAllLineEndings')
}

function onenterparagraph() {
  // Paragraphs in tight containers are not wrapped in `<p>`.
  if (!tightStack[tightStack.length - 1]) {
    lineEndingIfNeeded()
    tag('<p>')
  }

  setData('slurpAllLineEndings')
}

function onexitparagraph() {
  if (tightStack[tightStack.length - 1]) {
    setData('slurpAllLineEndings', true)
  } else {
    tag('</p>')
  }
}

// Open fenced code; the tag is left open so a `class` attribute can follow.
function onentercodefenced() {
  lineEndingIfNeeded()
  tag('<pre><code')
  setData('fencesCount', 0)
}

// Expose the info string as a `language-*` class.
function onexitcodefencedfenceinfo() {
  var value = resume()
  tag(' class="language-' + value + '"')
}

// Finish the opening tag after the first fence, and count fences.
function onexitcodefencedfence() {
  if (!getData('fencesCount')) {
    tag('>')
    setData('fencedCodeInside', true)
    setData('slurpOneLineEnding', true)
  }

  setData('fencesCount', getData('fencesCount') + 1)
}

  // Open an indented code block on its own line.
  // The tag literal had been emptied, so nothing was emitted.
  function onentercodeindented() {
    lineEndingIfNeeded()
    tag('<pre><code>')
  }

  // Close flow (fenced or indented) code and reset the code-related state.
  function onexitflowcode() {
    // Send an extra line feed if we saw data.
    if (getData('flowCodeSeenData')) lineEndingIfNeeded()
    // The closing tags had been stripped to an empty string.
    tag('</code></pre>')
    // With fewer than two fences seen, add a line ending after the block.
    // (`fencesCount` is unset for indented code, so this does not fire there.)
    if (getData('fencesCount') < 2) lineEndingIfNeeded()
    setData('flowCodeSeenData')
    setData('fencesCount')
    setData('slurpOneLineEnding')
  }

  // Start tracking an image and turn tag output off while inside it.
  function onenterimage() {
    var media = {image: true}

    tags = undefined // Disallow tags.
    mediaStack.push(media)
  }

  // Start tracking a link: push a fresh, empty media record.
  function onenterlink() {
    var media = {}

    mediaStack.push(media)
  }

  // Remember the source text of the label as `labelId` on the current media.
  function onexitlabeltext(token) {
    var media = mediaStack[mediaStack.length - 1]

    media.labelId = this.sliceSerialize(token)
  }

  // Store the buffered (compiled) label on the current media.
  function onexitlabel() {
    var media = mediaStack[mediaStack.length - 1]

    media.label = resume()
  }

  // Remember the source text of the reference as `referenceId` on the current
  // media.
  function onexitreferencestring(token) {
    var media = mediaStack[mediaStack.length - 1]

    media.referenceId = this.sliceSerialize(token)
  }

  // Enter a resource (`(destination "title")`): capture its output and mark
  // the current media as having an (initially empty) destination.
  function onenterresource() {
    var media

    buffer() // We can have line endings in the resource, ignore them.
    media = mediaStack[mediaStack.length - 1]
    media.destination = ''
  }

  // Start capturing a resource destination.
  // The capture is read back with `resume` in
  // `onexitresourcedestinationstring`, which also clears `ignoreEncode`.
  function onenterresourcedestinationstring() {
    buffer()
    // Ignore encoding the result, as we’ll first percent encode the url and
    // encode manually after.
    setData('ignoreEncode', true)
  }

  // Store the captured destination on the current media and turn encoding
  // back on.
  function onexitresourcedestinationstring() {
    var media = mediaStack[mediaStack.length - 1]

    media.destination = resume()
    setData('ignoreEncode')
  }

  // Store the captured title on the current media.
  function onexitresourcetitlestring() {
    var media = mediaStack[mediaStack.length - 1]

    media.title = resume()
  }

  // Compile a finished link or image.
  //
  // The destination and title come from the media itself when it has a
  // resource, or otherwise from the definition matching its reference (or
  // label) identifier.
  // The statements that emit `<img …>` / `<a …>` had been stripped, leaving
  // `context` unused and links compiled to a bare `>`; they are restored here.
  function onexitmedia() {
    var index = mediaStack.length - 1 // Skip current.
    var media = mediaStack[index]
    var context =
      media.destination === undefined
        ? definitions[normalizeIdentifier(media.referenceId || media.labelId)]
        : media

    tags = true

    // Tags stay disallowed while any ancestor media is an image.
    while (index--) {
      if (mediaStack[index].image) {
        tags = undefined
        break
      }
    }

    if (media.image) {
      tag('<img src="' + url(context.destination, protocolSrc) + '" alt="')
      raw(media.label)
      tag('"')
    } else {
      tag('<a href="' + url(context.destination, protocolHref) + '"')
    }

    tag(context.title ? ' title="' + context.title + '"' : '')

    if (media.image) {
      tag(' />')
    } else {
      tag('>')
      raw(media.label)
      tag('</a>')
    }

    mediaStack.pop()
  }

  // Enter a definition: capture its output and start a fresh media record.
  function onenterdefinition() {
    var definition = {}

    buffer()
    mediaStack.push(definition)
  }

  // Drop the compiled label and keep the source text as `labelId` instead.
  function onexitdefinitionlabelstring(token) {
    var definition

    // Discard label, use the source content instead.
    resume()
    definition = mediaStack[mediaStack.length - 1]
    definition.labelId = this.sliceSerialize(token)
  }

  // Start capturing a definition destination, with encoding turned off
  // (`ignoreEncode`) while it is captured.
  // The capture is read back with `resume` in
  // `onexitdefinitiondestinationstring`, which also clears `ignoreEncode`.
  function onenterdefinitiondestinationstring() {
    buffer()
    setData('ignoreEncode', true)
  }

  // Store the captured destination on the current definition and turn
  // encoding back on.
  function onexitdefinitiondestinationstring() {
    var definition = mediaStack[mediaStack.length - 1]

    definition.destination = resume()
    setData('ignoreEncode')
  }

  // Store the captured title on the current definition.
  function onexitdefinitiontitlestring() {
    var definition = mediaStack[mediaStack.length - 1]

    definition.title = resume()
  }

  // Register a finished definition under its normalized identifier.
  // An identifier that is already defined is left untouched, so the first
  // definition wins.
  function onexitdefinition() {
    var definition = mediaStack[mediaStack.length - 1]
    var id = normalizeIdentifier(definition.labelId)

    // Close the buffer opened when the definition was entered.
    resume()
    mediaStack.pop()

    if (own.call(definitions, id)) return

    definitions[id] = definition
  }

  // Entering content: set `slurpAllLineEndings`, which makes
  // `onexitlineending` drop line endings until the flag is cleared.
  function onentercontent() {
    setData('slurpAllLineEndings', true)
  }

  // Open an ATX heading: the length of the first sequence is the rank.
  // The opening tag had been stripped to an empty string.
  function onexitatxheadingsequence(token) {
    // Exit for further sequences.
    if (getData('headingRank')) return
    setData('headingRank', this.sliceSerialize(token).length)
    lineEndingIfNeeded()
    tag('<h' + getData('headingRank') + '>')
  }

  // Enter a setext heading: capture its content (the rank is only known once
  // the underline is seen) and stop slurping line endings.
  function onentersetextheading() {
    buffer()
    setData('slurpAllLineEndings')
  }

  // After the heading text: slurp line endings until the flag is cleared in
  // `onexitsetextheading`.
  function onexitsetextheadingtext() {
    setData('slurpAllLineEndings', true)
  }

  // Close an ATX heading and forget its rank.
  // The closing tag had been stripped to an empty string.
  function onexitatxheading() {
    tag('</h' + getData('headingRank') + '>')
    setData('headingRank')
  }

  // Derive the heading rank from the underline: `=` is rank 1, anything else
  // is rank 2.
  function onexitsetextheadinglinesequence(token) {
    var marker = this.sliceSerialize(token).charCodeAt(0)
    var rank = marker === codes.equalsTo ? 1 : 2

    setData('headingRank', rank)
  }

  // Close a setext heading: wrap the captured content in `<hN>`…`</hN>`, where
  // N is the rank set by the underline, then reset heading state.
  // Both tags had been stripped to empty strings.
  function onexitsetextheading() {
    var value = resume()
    lineEndingIfNeeded()
    tag('<h' + getData('headingRank') + '>')
    raw(value)
    tag('</h' + getData('headingRank') + '>')
    setData('slurpAllLineEndings')
    setData('headingRank')
  }

  // Output the source text of a token, encoded for HTML.
  function onexitdata(token) {
    var value = this.sliceSerialize(token)

    raw(encode(value))
  }

  // Handle a line ending in the source.
  // It is dropped while `slurpAllLineEndings` is set, dropped once (and the
  // flag cleared) when `slurpOneLineEnding` is set, replaced by a space in
  // code (text), and copied over (encoded) otherwise.
  function onexitlineending(token) {
    if (getData('slurpAllLineEndings')) return

    if (getData('slurpOneLineEnding')) {
      setData('slurpOneLineEnding')
      return
    }

    raw(getData('inCodeText') ? ' ' : encode(this.sliceSerialize(token)))
  }

  // Output a line of flow code and note that the block has content.
  function onexitcodeflowvalue(token) {
    var value = this.sliceSerialize(token)

    raw(encode(value))
    setData('flowCodeSeenData', true)
  }

  // Output a hard line break.
  // The literal had been split over two lines (a syntax error) after its
  // `<br />` content was lost.
  function onexithardbreak() {
    tag('<br />')
  }

  // Enter HTML (flow): make sure it starts on its own line, then apply the
  // same handling as for HTML (text).
  function onenterhtmlflow() {
    lineEndingIfNeeded()
    onenterhtml()
  }

  // Leave HTML: clear `ignoreEncode`, so `encode` escapes values again.
  function onexithtml() {
    setData('ignoreEncode')
  }

  // Enter HTML: only when `allowDangerousHtml` is set is the source passed
  // through unencoded; otherwise it is encoded like any other data.
  function onenterhtml() {
    if (!settings.allowDangerousHtml) return

    setData('ignoreEncode', true)
  }

  // Open emphasis. The tag literal had been stripped to an empty string.
  function onenteremphasis() {
    tag('<em>')
  }

  // Open strong. The tag literal had been stripped to an empty string.
  function onenterstrong() {
    tag('<strong>')
  }

  // Open code (text). While `inCodeText` is set, line endings are output as
  // spaces (see `onexitlineending`).
  // The tag literal had been stripped to an empty string.
  function onentercodetext() {
    setData('inCodeText', true)
    tag('<code>')
  }

  // Close code (text) and clear `inCodeText`.
  // The tag literal had been stripped to an empty string.
  function onexitcodetext() {
    setData('inCodeText')
    tag('</code>')
  }

  // Close emphasis. The tag literal had been stripped to an empty string.
  function onexitemphasis() {
    tag('</em>')
  }

  // Close strong. The tag literal had been stripped to an empty string.
  function onexitstrong() {
    tag('</strong>')
  }

  // Output a thematic break on its own line.
  // The tag literal had been stripped to an empty string.
  function onexitthematicbreak() {
    lineEndingIfNeeded()
    tag('<hr />')
  }

  // Remember which kind of marker (numeric or hexadecimal) was seen, so that
  // `onexitcharacterreferencevalue` knows how to decode the value.
  function onexitcharacterreferencemarker(token) {
    setData('characterReferenceType', token.type)
  }

  // Decode a character reference and output the result, encoded for HTML.
  // With a marker seen, the value is a number (decimal for the numeric
  // marker, hexadecimal otherwise); without one, it is a named entity.
  function onexitcharacterreferencevalue(token) {
    var source = this.sliceSerialize(token)
    var kind = getData('characterReferenceType')
    var decoded

    if (kind) {
      decoded = safeFromInt(
        source,
        kind === types.characterReferenceMarkerNumeric
          ? constants.numericBaseDecimal
          : constants.numericBaseHexadecimal
      )
    } else {
      decoded = decodeEntity(source)
    }

    raw(encode(decoded))
    setData('characterReferenceType')
  }

  // Compile a URL autolink to a link whose `href` is the URL (checked against
  // the `href` protocol allowlist) and whose content is the URL itself.
  // The tags had been stripped, so the URL was output as plain text.
  function onexitautolinkprotocol(token) {
    var uri = this.sliceSerialize(token)
    tag('<a href="' + url(uri, protocolHref) + '">')
    raw(encode(uri))
    tag('</a>')
  }

  // Compile an email autolink to a `mailto:` link whose content is the
  // address itself.
  // The tags had been stripped, so the address was output as plain text.
  function onexitautolinkemail(token) {
    var uri = this.sliceSerialize(token)
    tag('<a href="' + url('mailto:' + uri, protocolHref) + '">')
    raw(encode(uri))
    tag('</a>')
  }
}