814 lines
22 KiB
JavaScript
Raw Permalink Normal View History

2022-03-10 10:36:59 +01:00
// While micromark is a lexer/tokenizer, the common case of going from markdown
// to html is currently built in as this module, even though the parts can be
// used separately to build ASTs, CSTs, or many other output formats.
//
// Having an HTML compiler built in is useful because it allows us to check for
// compliance with CommonMark, the de facto norm of markdown, specified in
// roughly 600 input/output cases.
//
// This module has an interface which accepts lists of events instead of the
// whole at once; however, because markdown can't be truly streaming, we buffer
// events before processing and outputting the final result.
export default compileHtml
import decodeEntity from 'parse-entities/decode-entity.js'
import codes from '../character/codes.mjs'
import assign from '../constant/assign.mjs'
import constants from '../constant/constants.mjs'
import own from '../constant/has-own-property.mjs'
import types from '../constant/types.mjs'
import combineHtmlExtensions from '../util/combine-html-extensions.mjs'
import chunkedPush from '../util/chunked-push.mjs'
import miniflat from '../util/miniflat.mjs'
import normalizeIdentifier from '../util/normalize-identifier.mjs'
import normalizeUri from '../util/normalize-uri.mjs'
import safeFromInt from '../util/safe-from-int.mjs'
// Characters that have a special meaning in HTML, mapped to the name of the
// character reference that replaces them in the output.
// Technically `>` and `"` could be skipped in many cases, but CommonMark
// includes them.
const characterReferences = {
  '"': 'quot',
  '&': 'amp',
  '<': 'lt',
  '>': 'gt'
}

// Allowlists of essentially safe protocols for full URLs: the first is used
// for `href` (on `<a>`), the second for `src` (on `<img>`).
// They are based on what is allowed on GitHub,
// <https://github.com/syntax-tree/hast-util-sanitize/blob/9275b21/lib/github.json#L31>
const protocolHref = /^(https?|ircs?|mailto|xmpp)$/i
const protocolSrc = /^https?$/i
// Create an HTML compiler.
// `options` is optional configuration (described below); the return value is
// `compile`, the function that handles events and returns a string of HTML.
function compileHtml(options) {
// Configuration.
// Includes `htmlExtensions` (an array of extensions), `defaultLineEnding` (a
// preferred EOL), `allowDangerousProtocol` (whether to allow potentially
// dangerous protocols), and `allowDangerousHtml` (whether to allow potentially
// dangerous HTML).
var settings = options || {}
// `tags` is needed because, according to markdown, links and emphasis and
// whatnot can exist in images; however, as HTML doesn't allow content in
// images, the tags are ignored in the `alt` attribute, but the content
// remains.
var tags = true
// An object to track identifiers to media (URLs and titles) defined with
// definitions.
var definitions = {}
// A lot of the handlers need to capture some of the output data, modify it
// somehow, and then deal with it.
// We do that by tracking a stack of buffers, that can be opened (with
// `buffer`) and closed (with `resume`) to access them.
var buffers = [[]]
// As we can have links in images and the other way around, where the deepest
// ones are closed first, we need to track which one we're in.
var mediaStack = []
// Same for tightness, which is specific to lists.
// We need to track if we're currently in a tight or loose container.
var tightStack = []
// The built-in handlers, keyed first by `enter` or `exit` and then by token
// type.
var defaultHandlers = {
enter: {
blockQuote: onenterblockquote,
codeFenced: onentercodefenced,
codeFencedFenceInfo: buffer,
codeFencedFenceMeta: buffer,
codeIndented: onentercodeindented,
codeText: onentercodetext,
content: onentercontent,
definition: onenterdefinition,
definitionDestinationString: onenterdefinitiondestinationstring,
definitionLabelString: buffer,
definitionTitleString: buffer,
emphasis: onenteremphasis,
htmlFlow: onenterhtmlflow,
htmlText: onenterhtml,
image: onenterimage,
label: buffer,
link: onenterlink,
listItemMarker: onenterlistitemmarker,
listItemValue: onenterlistitemvalue,
listOrdered: onenterlistordered,
listUnordered: onenterlistunordered,
paragraph: onenterparagraph,
reference: buffer,
resource: onenterresource,
resourceDestinationString: onenterresourcedestinationstring,
resourceTitleString: buffer,
setextHeading: onentersetextheading,
strong: onenterstrong
},
exit: {
atxHeading: onexitatxheading,
atxHeadingSequence: onexitatxheadingsequence,
autolinkEmail: onexitautolinkemail,
autolinkProtocol: onexitautolinkprotocol,
blockQuote: onexitblockquote,
characterEscapeValue: onexitdata,
characterReferenceMarkerHexadecimal: onexitcharacterreferencemarker,
characterReferenceMarkerNumeric: onexitcharacterreferencemarker,
characterReferenceValue: onexitcharacterreferencevalue,
codeFenced: onexitflowcode,
codeFencedFence: onexitcodefencedfence,
codeFencedFenceInfo: onexitcodefencedfenceinfo,
codeFencedFenceMeta: resume,
codeFlowValue: onexitcodeflowvalue,
codeIndented: onexitflowcode,
codeText: onexitcodetext,
codeTextData: onexitdata,
data: onexitdata,
definition: onexitdefinition,
definitionDestinationString: onexitdefinitiondestinationstring,
definitionLabelString: onexitdefinitionlabelstring,
definitionTitleString: onexitdefinitiontitlestring,
emphasis: onexitemphasis,
hardBreakEscape: onexithardbreak,
hardBreakTrailing: onexithardbreak,
htmlFlow: onexithtml,
htmlFlowData: onexitdata,
htmlText: onexithtml,
htmlTextData: onexitdata,
image: onexitmedia,
label: onexitlabel,
labelText: onexitlabeltext,
lineEnding: onexitlineending,
link: onexitmedia,
listOrdered: onexitlistordered,
listUnordered: onexitlistunordered,
paragraph: onexitparagraph,
reference: resume,
referenceString: onexitreferencestring,
resource: resume,
resourceDestinationString: onexitresourcedestinationstring,
resourceTitleString: onexitresourcetitlestring,
setextHeading: onexitsetextheading,
setextHeadingLineSequence: onexitsetextheadinglinesequence,
setextHeadingText: onexitsetextheadingtext,
strong: onexitstrong,
thematicBreak: onexitthematicbreak
}
}
// Combine the HTML extensions with the default handlers.
// An HTML extension is an object whose fields are either `enter` or `exit`
// (reflecting whether a token is entered or exited).
// The values at such objects are names of tokens mapping to handlers.
// Handlers are called, respectively when a token is opened or closed, with
// that token, and a context as `this`.
var handlers = combineHtmlExtensions(
[defaultHandlers].concat(miniflat(settings.htmlExtensions))
)
// Handlers do often need to keep track of some state.
// That state is provided here as a key-value store (an object).
var data = {tightStack: tightStack}
// The context for handlers references a couple of useful functions.
// In handlers from extensions, those can be accessed at `this`.
// For the handlers here, they can be accessed directly.
var context = {
lineEndingIfNeeded: lineEndingIfNeeded,
options: settings,
encode: encode,
raw: raw,
tag: tag,
buffer: buffer,
resume: resume,
setData: setData,
getData: getData
}
// Generally, micromark copies line endings (`'\r'`, `'\n'`, `'\r\n'`) in the
// markdown document over to the compiled HTML.
// In some cases, such as `> a`, CommonMark requires that extra line endings
// are added: `<blockquote>\n<p>a</p>\n</blockquote>`.
// This variable holds the default line ending when given (or `undefined`),
// and in the latter case will be updated to the first found line ending if
// there is one.
var lineEndingStyle = settings.defaultLineEnding
// Return the function that handles a slice of events.
return compile
// Compile a list of events to HTML.
// Returns the contents of the root buffer, joined into one string.
function compile(events) {
  // Definitions can come after the references that use them, so the media
  // (URLs and titles) they define must be known before references are handled.
  // We do roughly what HTML does: metadata first (`head`), then content
  // (`body`).
  let head = []
  let body = []
  // Indices of the `enter` events of lists that are still open.
  const openLists = []
  // Index of the first event that has not been moved to `head` or `body` yet.
  let start = 0

  for (let position = 0; position < events.length; position++) {
    const event = events[position]
    const entering = event[0] === 'enter'
    const type = event[1].type

    // Pick up the line ending style of the document from its first line ending
    // (only when none is known yet).
    if (
      !lineEndingStyle &&
      (type === types.lineEnding || type === types.lineEndingBlank)
    ) {
      lineEndingStyle = event[2].sliceSerialize(event[1])
    }

    // Preprocess lists to infer whether they are loose.
    if (type === types.listOrdered || type === types.listUnordered) {
      if (entering) {
        openLists.push(position)
      } else {
        prepareList(events.slice(openLists.pop(), position))
      }
    }

    // Move definitions to the front.
    if (type === types.definition) {
      if (entering) {
        body = chunkedPush(body, events.slice(start, position))
        start = position
      } else {
        head = chunkedPush(head, events.slice(start, position + 1))
        start = position + 1
      }
    }
  }

  // Definitions first, then the content seen so far, then whatever is left.
  const ordered = chunkedPush(chunkedPush(head, body), events.slice(start))

  // Handle the start of the document, if defined.
  if (handlers.enter.null) {
    handlers.enter.null.call(context)
  }

  // Handle all events, in their new order.
  for (let position = 0; position < events.length; position++) {
    const event = ordered[position]
    const group = handlers[event[0]]
    const token = event[1]

    if (own.call(group, token.type)) {
      // Handlers get the shared context plus the `sliceSerialize` that belongs
      // to this event.
      group[token.type].call(
        assign({sliceSerialize: event[2].sliceSerialize}, context),
        token
      )
    }
  }

  // Handle the end of the document, if defined.
  if (handlers.exit.null) {
    handlers.exit.null.call(context)
  }

  return buffers[0].join('')
}
// Infer whether a list is loose, and record the result as `_loose` on the
// token of the first event in `slice` (`true` when loose, `undefined`
// otherwise).
function prepareList(slice) {
  // The first event (the opening of the list) and the last event of the slice
  // are not inspected.
  const end = slice.length - 1
  // How many containers are currently open inside the list.
  let depth = 0
  // Whether the previous relevant event closed a list item prefix.
  let afterMarker = false
  let loose

  for (let position = 1; position < end; position++) {
    const kind = slice[position][0]
    const token = slice[position][1]

    if (token._container) {
      afterMarker = false
      depth += kind === 'enter' ? 1 : -1
      continue
    }

    if (token.type === types.listItemPrefix) {
      if (kind === 'exit') {
        afterMarker = true
      }

      continue
    }

    // Line prefixes do not change anything.
    if (token.type === types.linePrefix) {
      continue
    }

    if (token.type === types.lineEndingBlank) {
      // Only blank lines directly in this list count; one that directly follows
      // a marker does not make the list loose.
      if (kind === 'enter' && !depth) {
        if (afterMarker) {
          afterMarker = false
        } else {
          loose = true
        }
      }

      continue
    }

    afterMarker = false
  }

  slice[0][1]._loose = loose
}
// Store `value` under `key` in the shared key-value store.
// When `value` is left out, `undefined` is stored.
function setData(key, value) {
  data[key] = value
}
// Read the value stored under `key` in the shared key-value store.
function getData(key) {
  const value = data[key]
  return value
}
// Start capturing output: push a new, empty buffer onto the stack of buffers.
function buffer() {
  const capture = []
  buffers.push(capture)
}
// Stop capturing: pop the current buffer off the stack and return what it
// collected, joined into one string.
function resume() {
  const captured = buffers.pop()
  return captured.join('')
}
// Append (part of) an HTML tag to the current buffer and remember that the
// last output was a tag.
// Nothing happens while `tags` is falsy.
function tag(value) {
  if (tags) {
    setData('lastWasTag', true)
    const current = buffers[buffers.length - 1]
    current.push(value)
  }
}
// Append raw data to the current buffer and clear the `lastWasTag` field.
function raw(value) {
  setData('lastWasTag')
  const current = buffers[buffers.length - 1]
  current.push(value)
}
// Output an extra line ending: the known line ending style, or `'\n'` when
// none is known.
function lineEnding() {
  const eol = lineEndingStyle ? lineEndingStyle : '\n'
  raw(eol)
}
// Output an extra line ending, unless the current buffer is empty (or its last
// chunk is) or its last chunk already ends in a line feed or carriage return.
function lineEndingIfNeeded() {
  const current = buffers[buffers.length - 1]
  const last = current[current.length - 1]
  const previous = last ? last.charCodeAt(last.length - 1) : codes.eof
  const atLineStart =
    previous === codes.lf || previous === codes.cr || previous === codes.eof

  if (!atLineStart) {
    lineEnding()
  }
}
// Make a value safe for injection in HTML by replacing `"`, `&`, `<`, and `>`
// with character references.
// The value is returned untouched while the `ignoreEncode` field is set.
function encode(value) {
  if (getData('ignoreEncode')) {
    return value
  }

  return value.replace(/["&<>]/g, function (character) {
    return '&' + characterReferences[character] + ';'
  })
}
// Make a value safe for injection as a URL.
// The value is first passed through `normalizeUri` and then through `encode`.
// Finally, unless `allowDangerousProtocol` is set, a URL whose protocol does
// not match `protocol` (such as a dangerous one, `javascript:`) is replaced
// with the empty string.
function url(url, protocol) {
  const value = encode(normalizeUri(url || ''))

  if (settings.allowDangerousProtocol) {
    return value
  }

  const colon = value.indexOf(':')

  // Without a colon there is no protocol: the URL is relative.
  if (colon < 0) {
    return value
  }

  // A first colon that comes after a `/`, `?`, or `#` is not a protocol.
  const delimiters = ['/', '?', '#']

  for (const delimiter of delimiters) {
    const at = value.indexOf(delimiter)

    if (at > -1 && colon > at) {
      return value
    }
  }

  // There is a protocol: it has to be an allowed one.
  return protocol.test(value.slice(0, colon)) ? value : ''
}
//
// Handlers.
//
// Open an ordered list: remember whether it is tight and start `<ol`.
// The tag is not closed here; `expectFirstItem` marks that the first item
// still has to come.
function onenterlistordered(token) {
  const tight = !token._loose
  tightStack.push(tight)
  lineEndingIfNeeded()
  tag('<ol')
  setData('expectFirstItem', true)
}
// Open an unordered list: remember whether it is tight and start `<ul`.
// The tag is not closed here; `expectFirstItem` marks that the first item
// still has to come.
function onenterlistunordered(token) {
  const tight = !token._loose
  tightStack.push(tight)
  lineEndingIfNeeded()
  tag('<ul')
  setData('expectFirstItem', true)
}
// For the first item of a list, add a `start` attribute when the item's value
// is not `1`.
function onenterlistitemvalue(token) {
  if (!getData('expectFirstItem')) {
    return
  }

  const start = parseInt(
    this.sliceSerialize(token),
    constants.numericBaseDecimal
  )

  if (start === 1) {
    return
  }

  tag(' start="' + encode(String(start)) + '"')
}
// Start a list item.
// For the first item, the opening tag of the list is still open and is closed
// here; for later items, the previous item is closed instead.
function onenterlistitemmarker() {
  const first = getData('expectFirstItem')

  if (first) {
    tag('>')
  } else {
    onexitlistitem()
  }

  lineEndingIfNeeded()
  tag('<li>')
  setData('expectFirstItem')
  // “Hack” to prevent a line ending from showing up if the item is empty.
  setData('lastWasTag')
}
// Close an ordered list: close its last item, drop its tightness from the
// stack, and output the closing tag on a line of its own.
function onexitlistordered() {
  onexitlistitem()
  tightStack.pop()
  lineEnding()
  tag('</ol>')
}
// Close an unordered list: close its last item, drop its tightness from the
// stack, and output the closing tag on a line of its own.
function onexitlistunordered() {
  onexitlistitem()
  tightStack.pop()
  lineEnding()
  tag('</ul>')
}
// Close a list item.
// A line ending is considered first when the last output was a tag and line
// endings are not being slurped.
function onexitlistitem() {
  const needsBreak = getData('lastWasTag') && !getData('slurpAllLineEndings')

  if (needsBreak) {
    lineEndingIfNeeded()
  }

  tag('</li>')
  setData('slurpAllLineEndings')
}
// Open a block quote; its content is never tight.
function onenterblockquote() {
  const tight = false
  tightStack.push(tight)
  lineEndingIfNeeded()
  tag('<blockquote>')
}
// Close a block quote: drop its entry from the tightness stack, output the
// closing tag, and stop slurping line endings.
function onexitblockquote() {
  tightStack.pop()
  lineEndingIfNeeded()
  tag('</blockquote>')
  setData('slurpAllLineEndings')
}
// Open a paragraph.
// No `<p>` is written in a tight container.
function onenterparagraph() {
  const tight = tightStack[tightStack.length - 1]

  if (!tight) {
    lineEndingIfNeeded()
    tag('<p>')
  }

  setData('slurpAllLineEndings')
}
// Close a paragraph.
// In a tight container there is no `</p>` to write; line endings are slurped
// from here on instead.
function onexitparagraph() {
  const tight = tightStack[tightStack.length - 1]

  if (!tight) {
    tag('</p>')
    return
  }

  setData('slurpAllLineEndings', true)
}
// Open fenced code.
// The `<code` tag is left open, and the number of fences seen starts at zero.
function onentercodefenced() {
  lineEndingIfNeeded()
  tag('<pre><code')
  setData('fencesCount', 0)
}
// Turn the captured info string of a code fence into a `language-*` class.
function onexitcodefencedfenceinfo() {
  tag(' class="language-' + resume() + '"')
}
// Handle the end of a code fence and count it.
// After the first fence, the still-open `<code` tag is closed and the next
// line ending is slurped.
function onexitcodefencedfence() {
  const seen = getData('fencesCount')

  if (!seen) {
    tag('>')
    setData('fencedCodeInside', true)
    setData('slurpOneLineEnding', true)
  }

  setData('fencesCount', seen + 1)
}
// Open indented code.
function onentercodeindented() {
  lineEndingIfNeeded()
  tag('<pre><code>')
}
// Close fenced or indented code and reset the state that was tracked for it.
function onexitflowcode() {
  // Send an extra line feed if we saw data.
  if (getData('flowCodeSeenData')) {
    lineEndingIfNeeded()
  }

  tag('</code></pre>')

  if (getData('fencesCount') < 2) {
    lineEndingIfNeeded()
  }

  for (const key of ['flowCodeSeenData', 'fencesCount', 'slurpOneLineEnding']) {
    setData(key)
  }
}
// Enter an image: track it on the media stack and disallow tags.
function onenterimage() {
  const media = {image: true}
  mediaStack.push(media)
  tags = undefined
}
// Enter a link: track it on the media stack.
function onenterlink() {
  const media = {}
  mediaStack.push(media)
}
// Remember the source text of the label as `labelId` on the current media.
function onexitlabeltext(token) {
  const media = mediaStack[mediaStack.length - 1]
  media.labelId = this.sliceSerialize(token)
}
// Store the captured output of the label as `label` on the current media.
function onexitlabel() {
  const media = mediaStack[mediaStack.length - 1]
  media.label = resume()
}
// Remember the source text of the reference as `referenceId` on the current
// media.
function onexitreferencestring(token) {
  const media = mediaStack[mediaStack.length - 1]
  media.referenceId = this.sliceSerialize(token)
}
// Enter a resource.
// Its output is captured, as a resource can contain line endings that have to
// be ignored; the destination of the current media starts out empty.
function onenterresource() {
  buffer()
  const media = mediaStack[mediaStack.length - 1]
  media.destination = ''
}
// Start capturing the destination of a resource.
function onenterresourcedestinationstring() {
  buffer()
  // Do not encode the captured value: the URL is percent-encoded first and
  // encoded manually afterwards.
  setData('ignoreEncode', true)
}
// Store the captured destination on the current media and turn encoding back
// on.
function onexitresourcedestinationstring() {
  const media = mediaStack[mediaStack.length - 1]
  media.destination = resume()
  setData('ignoreEncode')
}
// Store the captured title on the current media.
function onexitresourcetitlestring() {
  const media = mediaStack[mediaStack.length - 1]
  media.title = resume()
}
// Output a link or an image, and remove it from the media stack.
function onexitmedia() {
  const media = mediaStack[mediaStack.length - 1]
  // Without a destination of its own, the destination and title come from the
  // definition matching the reference (or otherwise the label).
  const source =
    media.destination === undefined
      ? definitions[normalizeIdentifier(media.referenceId || media.labelId)]
      : media
  // Tags stay disallowed when any enclosing media is an image.
  const insideImage = mediaStack.slice(0, -1).some(function (outer) {
    return outer.image
  })

  tags = insideImage ? undefined : true

  if (media.image) {
    tag('<img src="' + url(source.destination, protocolSrc) + '" alt="')
    raw(media.label)
    tag('"')
    tag(source.title ? ' title="' + source.title + '"' : '')
    tag(' />')
  } else {
    tag('<a href="' + url(source.destination, protocolHref) + '"')
    tag(source.title ? ' title="' + source.title + '"' : '')
    tag('>')
    raw(media.label)
    tag('</a>')
  }

  mediaStack.pop()
}
// Enter a definition: capture its output and track it on the media stack.
function onenterdefinition() {
  buffer()
  const media = {}
  mediaStack.push(media)
}
// Remember the label of a definition as `labelId`.
function onexitdefinitionlabelstring(token) {
  // The captured output is discarded; the source content is used instead.
  resume()
  const media = mediaStack[mediaStack.length - 1]
  media.labelId = this.sliceSerialize(token)
}
// Start capturing the destination of a definition, without encoding it.
function onenterdefinitiondestinationstring() {
  buffer()
  setData('ignoreEncode', true)
}
// Store the captured destination on the current definition and turn encoding
// back on.
function onexitdefinitiondestinationstring() {
  const media = mediaStack[mediaStack.length - 1]
  media.destination = resume()
  setData('ignoreEncode')
}
// Store the captured title on the current definition.
function onexitdefinitiontitlestring() {
  const media = mediaStack[mediaStack.length - 1]
  media.title = resume()
}
// Exit a definition: register it under its normalized label, unless a
// definition with that identifier already exists, and drop it from the media
// stack.
function onexitdefinition() {
  const definition = mediaStack[mediaStack.length - 1]
  const id = normalizeIdentifier(definition.labelId)

  // Discard what was captured since the definition was entered.
  resume()

  if (!own.call(definitions, id)) {
    definitions[id] = definition
  }

  mediaStack.pop()
}
// Enter content: slurp all line endings from here on.
function onentercontent() {
  setData('slurpAllLineEndings', true)
}
// Handle the opening sequence of an ATX heading: its length is the rank.
// Further sequences, seen while a rank is already known, are ignored.
function onexitatxheadingsequence(token) {
  if (!getData('headingRank')) {
    const rank = this.sliceSerialize(token).length
    setData('headingRank', rank)
    lineEndingIfNeeded()
    tag('<h' + rank + '>')
  }
}
// Enter a setext heading: capture its output and stop slurping line endings.
function onentersetextheading() {
  buffer()
  setData('slurpAllLineEndings')
}
// After the text of a setext heading, slurp all line endings.
function onexitsetextheadingtext() {
  setData('slurpAllLineEndings', true)
}
// Close an ATX heading and forget its rank.
function onexitatxheading() {
  const rank = getData('headingRank')
  tag('</h' + rank + '>')
  setData('headingRank')
}
// Derive the rank of a setext heading from its underline: `1` when the
// sequence starts with an equals sign, `2` otherwise.
function onexitsetextheadinglinesequence(token) {
  const first = this.sliceSerialize(token).charCodeAt(0)
  const rank = first === codes.equalsTo ? 1 : 2
  setData('headingRank', rank)
}
// Close a setext heading: wrap the captured content in a heading of the known
// rank, then reset the heading state.
function onexitsetextheading() {
  const content = resume()
  lineEndingIfNeeded()
  const rank = getData('headingRank')
  tag('<h' + rank + '>')
  raw(content)
  tag('</h' + rank + '>')
  setData('slurpAllLineEndings')
  setData('headingRank')
}
// Output the source text of a token, encoded.
function onexitdata(token) {
  const value = this.sliceSerialize(token)
  raw(encode(value))
}
// Handle a line ending.
// It is dropped while all line endings are slurped, dropped once when a single
// one is slurped, turned into a space in code (text), and otherwise copied to
// the output, encoded.
function onexitlineending(token) {
  if (!getData('slurpAllLineEndings')) {
    if (getData('slurpOneLineEnding')) {
      setData('slurpOneLineEnding')
    } else if (getData('inCodeText')) {
      raw(' ')
    } else {
      raw(encode(this.sliceSerialize(token)))
    }
  }
}
// Output a chunk of flow code, encoded, and remember that data was seen.
function onexitcodeflowvalue(token) {
  const value = this.sliceSerialize(token)
  raw(encode(value))
  setData('flowCodeSeenData', true)
}
// Output a hard break.
function onexithardbreak() {
  tag('<br />')
}
// Enter HTML (flow): make sure it starts on its own line, then handle it like
// any other HTML.
function onenterhtmlflow() {
  lineEndingIfNeeded()
  onenterhtml()
}
// Exit HTML: turn encoding back on.
function onexithtml() {
  setData('ignoreEncode')
}
// Enter HTML: only when `allowDangerousHtml` is set is encoding turned off, so
// that the HTML ends up in the output as is.
function onenterhtml() {
  if (!settings.allowDangerousHtml) {
    return
  }

  setData('ignoreEncode', true)
}
// Open emphasis.
function onenteremphasis() {
  tag('<em>')
}
// Open strong.
function onenterstrong() {
  tag('<strong>')
}
// Open code (text) and remember that we are inside it.
function onentercodetext() {
  setData('inCodeText', true)
  tag('<code>')
}
// Close code (text) and forget that we were inside it.
function onexitcodetext() {
  setData('inCodeText')
  tag('</code>')
}
// Close emphasis.
function onexitemphasis() {
  tag('</em>')
}
// Close strong.
function onexitstrong() {
  tag('</strong>')
}
// Output a thematic break on a line of its own.
function onexitthematicbreak() {
  lineEndingIfNeeded()
  tag('<hr />')
}
// Remember the type of the marker of a character reference.
function onexitcharacterreferencemarker(token) {
  const markerType = token.type
  setData('characterReferenceType', markerType)
}
// Decode the value of a character reference and output the result, encoded.
// With a marker type remembered, the value is a number (decimal for the
// numeric marker, hexadecimal otherwise); without one, it is passed to
// `decodeEntity`.
function onexitcharacterreferencevalue(token) {
  const source = this.sliceSerialize(token)
  const markerType = getData('characterReferenceType')
  let decoded

  if (markerType) {
    const base =
      markerType === types.characterReferenceMarkerNumeric
        ? constants.numericBaseDecimal
        : constants.numericBaseHexadecimal
    decoded = safeFromInt(source, base)
  } else {
    decoded = decodeEntity(source)
  }

  raw(encode(decoded))
  setData('characterReferenceType')
}
// Output an autolink for a URL: the source text is both the (sanitized)
// destination and the (encoded) content of the link.
function onexitautolinkprotocol(token) {
  const source = this.sliceSerialize(token)
  const destination = url(source, protocolHref)
  tag('<a href="' + destination + '">')
  raw(encode(source))
  tag('</a>')
}
// Output an autolink for an email address: the destination is the source text
// prefixed with `mailto:`, the content is the (encoded) source text itself.
function onexitautolinkemail(token) {
  const address = this.sliceSerialize(token)
  const destination = url('mailto:' + address, protocolHref)
  tag('<a href="' + destination + '">')
  raw(encode(address))
  tag('</a>')
}
}