123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645 |
- # -*- test-case-name: twisted.web.test.test_xml -*-
- #
- # Copyright (c) Twisted Matrix Laboratories.
- # See LICENSE for details.
-
-
- """
*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network
protocol.  It is not at all clever.  Its main features are that it
does not:

  - support namespaces
  - mung mnemonic entity references
  - validate
  - perform *any* external actions (such as fetching URLs or writing files)
    under *any* circumstances

It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they are not on by default).
- """
-
-
import codecs

from twisted.internet.protocol import Protocol
from twisted.python.reflect import prefixedMethodNames
-
# Indexes into the (begin, do, end) handler three-tuples stored in the
# parser's state table.
BEGIN_HANDLER, DO_HANDLER, END_HANDLER = 0, 1, 2

# Punctuation accepted, alongside alphanumerics, in tag and attribute names.
identChars = ".-_:"
# Extra punctuation tolerated in names when the parser is in lenient mode.
lenientIdentChars = identChars + ";+#/%~"
-
-
def nop(*args, **kw):
    """
    Accept any arguments, do nothing, and return None.

    Used as the placeholder handler for parser states that lack a begin, do
    or end method.
    """
    return None
-
-
def unionlist(*args):
    """
    Combine any number of iterables into the collection of their distinct
    items.

    Items must be hashable.  The result is a dictionary keys view that yields
    each distinct item once, in order of first appearance.
    """
    seen = {}
    for group in args:
        for item in group:
            seen[item] = 1
    return seen.keys()
-
-
def zipfndict(*args, **kw):
    """
    Zip several dictionaries together key by key.

    Every key that appears in any of the given dictionaries maps, in the
    result, to a tuple holding that key's value from each dictionary in
    argument order.  Where a dictionary lacks the key, the value of the
    C{default} keyword argument (L{nop} if not given) is used instead.
    """
    fallback = kw.get("default", nop)
    allKeys = unionlist(*(mapping.keys() for mapping in args))
    return {
        key: tuple(mapping.get(key, fallback) for mapping in args)
        for key in allKeys
    }
-
-
def prefixedMethodClassDict(clazz, prefix):
    """
    Map un-prefixed method names to the attributes found on C{clazz}.

    For every name reported by C{prefixedMethodNames(clazz, prefix)}, the
    result holds C{getattr(clazz, prefix + name)} under the key C{name}.
    """
    found = {}
    for name in prefixedMethodNames(clazz, prefix):
        found[name] = getattr(clazz, prefix + name)
    return found
-
-
def prefixedMethodObjDict(obj, prefix):
    """
    Map un-prefixed method names to the attributes looked up on C{obj}.

    The names are discovered on C{obj}'s class via C{prefixedMethodNames},
    but each attribute is fetched from C{obj} itself, so methods come back
    bound to the instance.
    """
    found = {}
    for name in prefixedMethodNames(obj.__class__, prefix):
        found[name] = getattr(obj, prefix + name)
    return found
-
-
class ParseError(Exception):
    """
    Raised when the parser meets input it cannot accept.

    Carries the file name, line, column and a message, and renders as
    C{filename:line:col: message}.
    """

    def __init__(self, filename, line, col, message):
        self.filename, self.line = filename, line
        self.col, self.message = col, message

    def __str__(self) -> str:
        return "{}:{}:{}: {}".format(
            self.filename, self.line, self.col, self.message
        )
-
-
class XMLParser(Protocol):
    """
    A character-at-a-time XML/HTML tokenizer driven by a state machine.

    A parser state named ``foo`` is implemented by up to three methods:
    ``begin_foo(byte)`` is called with the triggering character when the
    state is entered, ``do_foo(byte)`` is called for each later character and
    returns the name of the next state (or None to stay in the current one),
    and ``end_foo()`` is called when the state is left.  Handlers that a
    state does not define default to L{nop}.

    Parse events are delivered through the ``got*`` methods at the bottom of
    the class; their default implementations mostly just print.
    """

    # Name of the current state; None until the first dataReceived call.
    state = None
    # List of encodings applied to incoming bytes; created in connectionMade.
    encodings = None
    # File name reported in ParseError messages.
    filename = "<xml />"
    # When true, enables the hacks for tolerating broken HTML.
    beExtremelyLenient = 0
    # UTF-16 byte-order mark saved from the first chunk; _decode puts it back
    # in front of every chunk before decoding.
    _prepend = None

    # _leadingBodyData will sometimes be set before switching to the
    # 'bodydata' state, when we "accidentally" read a byte of bodydata
    # in a different state.
    _leadingBodyData = None

    # Per-instance cache of the state table (see _buildStateTable).
    _stateTable = None
    # Incremental UTF-8 decoder, created lazily by dataReceived.
    _utf8Decoder = None

    def connectionMade(self):
        """Reset the position counters and the list of encodings."""
        self.lineno = 1
        self.colno = 0
        self.encodings = []

    def saveMark(self):
        """Get the line number and column of the last character parsed"""
        # This gets replaced during dataReceived, restored afterwards
        return (self.lineno, self.colno)

    def _parseError(self, message):
        """
        Raise a L{ParseError} for C{message} at the current file position.
        """
        raise ParseError(self.filename, *self.saveMark(), message)

    def _buildStateTable(self):
        """
        Return a dictionary of begin, do, end state function tuples.

        The table maps each state name to a three-tuple of bound methods,
        indexed by BEGIN_HANDLER, DO_HANDLER and END_HANDLER.  It is built
        once per instance and then reused.
        """
        # The table holds methods bound to this instance, so it must be
        # cached on the instance, not on the class.  (The previous class-level
        # cache never hit: it was read back as "__stateTable" but stored
        # under the name-mangled "_XMLParser__stateTable", and it kept the
        # most recent parser alive through the class.)
        stateTable = self._stateTable
        if stateTable is None:
            stateTable = self._stateTable = zipfndict(
                *(
                    prefixedMethodObjDict(self, prefix)
                    for prefix in ("begin_", "do_", "end_")
                )
            )
        return stateTable

    def _decode(self, data):
        """
        Decode a chunk of bytes using the encodings in C{self.encodings}.

        The saved byte-order mark, if any, is prepended first.
        """
        if "UTF-16" in self.encodings or "UCS-2" in self.encodings:
            assert not len(data) & 1, "UTF-16 must come in pairs for now"
        if self._prepend:
            data = self._prepend + data
        for encoding in self.encodings:
            data = str(data, encoding)
        return data

    def maybeBodyData(self):
        """
        Choose the state that follows the end of a tag in lenient mode.

        Returns C{"waitforendscript"} after an opening C{<script>} tag that
        has no C{src} attribute, and C{"bodydata"} otherwise.
        """
        if self.endtag:
            return "bodydata"

        # Get ready for fun! We're going to allow
        # <script>if (foo < bar)</script> to work!
        # We do this by making everything between <script> and
        # </script> a Text
        # BUT <script src="foo"> will be special-cased to do regular,
        # lenient behavior, because those may not have </script>
        # -radix

        if self.tagName == "script" and "src" not in self.tagAttributes:
            # we do this ourselves rather than having begin_waitforendscript
            # because that can get called multiple times and we don't want
            # bodydata to get reset other than the first time.
            self.begin_bodydata(None)
            return "waitforendscript"
        return "bodydata"

    def dataReceived(self, data):
        """
        Decode a chunk of bytes and run it through the state machine.

        The first chunk is checked for a UTF-16 byte-order mark; without one
        the stream is decoded as UTF-8.  A UTF-8 character that is split
        across two chunks is held back until its remaining bytes arrive.

        @raise ParseError: if a state handler rejects the input.
        """
        stateTable = self._buildStateTable()
        if not self.state:
            # all UTF-16 starts with this string
            if data.startswith((b"\xff\xfe", b"\xfe\xff")):
                self._prepend = data[0:2]
                self.encodings.append("UTF-16")
                data = data[2:]
            self.state = "begin"
        if self.encodings:
            data = self._decode(data)
        else:
            # Network chunks can end in the middle of a multi-byte sequence;
            # an incremental decoder buffers the partial character instead of
            # raising UnicodeDecodeError.
            if self._utf8Decoder is None:
                self._utf8Decoder = codecs.getincrementaldecoder("utf-8")()
            data = self._utf8Decoder.decode(data)
        # bring state, lineno, colno into local scope
        lineno, colno = self.lineno, self.colno
        curState = self.state
        # replace saveMark with a nested scope function
        _saveMark = self.saveMark

        def saveMark():
            return (lineno, colno)

        self.saveMark = saveMark
        # fetch functions from the stateTable
        beginFn, doFn, endFn = stateTable[curState]
        try:
            for byte in data:
                # do newline stuff
                if byte == "\n":
                    lineno += 1
                    colno = 0
                else:
                    colno += 1
                newState = doFn(byte)
                if newState is not None and newState != curState:
                    # this is the endFn from the previous state
                    endFn()
                    curState = newState
                    beginFn, doFn, endFn = stateTable[curState]
                    beginFn(byte)
        finally:
            self.saveMark = _saveMark
            self.lineno, self.colno = lineno, colno
        # state doesn't make sense if there's an exception..
        self.state = curState

    def connectionLost(self, reason):
        """
        End the last state we were in.
        """
        if not self.state:
            # No data ever arrived, so there is no state to end; looking up
            # None in the state table would raise KeyError.
            return
        stateTable = self._buildStateTable()
        stateTable[self.state][END_HANDLER]()

    # state methods

    def do_begin(self, byte):
        """Skip leading whitespace and expect the document's first '<'."""
        if byte.isspace():
            return
        if byte != "<":
            if self.beExtremelyLenient:
                self._leadingBodyData = byte
                return "bodydata"
            self._parseError(f"First char of document [{byte!r}] wasn't <")
        return "tagstart"

    def begin_comment(self, byte):
        """Start with an empty comment buffer."""
        self.commentbuf = ""

    def do_comment(self, byte):
        """Buffer comment text until '-->' and report it via gotComment."""
        self.commentbuf += byte
        if self.commentbuf.endswith("-->"):
            self.gotComment(self.commentbuf[:-3])
            return "bodydata"

    def begin_tagstart(self, byte):
        """Reset the per-tag bookkeeping."""
        self.tagName = ""  # name of the tag
        self.tagAttributes = {}  # attributes of the tag
        self.termtag = 0  # is the tag self-terminating
        self.endtag = 0

    def do_tagstart(self, byte):
        """Accumulate the tag name and dispatch on the character ending it."""
        if byte.isalnum() or byte in identChars:
            self.tagName += byte
            if self.tagName == "!--":
                return "comment"
        elif byte.isspace():
            if self.tagName:
                if self.endtag:
                    # properly strict thing to do here is probably to only
                    # accept whitespace
                    return "waitforgt"
                return "attrs"
            else:
                self._parseError("Whitespace before tag-name")
        elif byte == ">":
            if self.endtag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            else:
                self.gotTagStart(self.tagName, {})
                return self.maybeBodyData() if self.beExtremelyLenient else "bodydata"
        elif byte == "/":
            if self.tagName:
                return "afterslash"
            else:
                self.endtag = 1
        elif byte in "!?":
            if self.tagName:
                if not self.beExtremelyLenient:
                    self._parseError("Invalid character in tag-name")
            else:
                self.tagName += byte
                self.termtag = 1
        elif byte == "[":
            if self.tagName == "!":
                return "expectcdata"
            else:
                self._parseError("Invalid '[' in tag-name")
        else:
            if self.beExtremelyLenient:
                # Treat the '<' as literal text rather than a tag opener.
                self.bodydata = "<"
                return "unentity"
            self._parseError("Invalid tag character: %r" % byte)

    def begin_unentity(self, byte):
        """Append the character that turned out not to start a tag."""
        self.bodydata += byte

    def do_unentity(self, byte):
        """Append one more character, then go back to body data."""
        self.bodydata += byte
        return "bodydata"

    def end_unentity(self):
        """Report the accumulated literal text."""
        self.gotText(self.bodydata)

    def begin_expectcdata(self, byte):
        """Start the CDATA header buffer with the triggering character."""
        self.cdatabuf = byte

    def do_expectcdata(self, byte):
        """Match the buffer against '[CDATA[' one character at a time."""
        self.cdatabuf += byte
        cdb = self.cdatabuf
        cd = "[CDATA["
        if len(cd) > len(cdb):
            if cd.startswith(cdb):
                return
            elif self.beExtremelyLenient:
                # MSWord9 generates HTML that includes bizarre
                # <![if !foo]> <![endif]> chunks, so ignore them as best we
                # can.  This should really be a separate parse state.
                return "waitforgt"
            else:
                self._parseError("Mal-formed CDATA header")
        if cd == cdb:
            self.cdatabuf = ""
            return "cdata"
        self._parseError("Mal-formed CDATA header")

    def do_cdata(self, byte):
        """Buffer CDATA content until the closing ']]>'."""
        self.cdatabuf += byte
        if self.cdatabuf.endswith("]]>"):
            self.cdatabuf = self.cdatabuf[:-3]
            return "bodydata"

    def end_cdata(self):
        """Report the buffered CDATA section and clear the buffer."""
        self.gotCData(self.cdatabuf)
        self.cdatabuf = ""

    def do_attrs(self, byte):
        """Between attributes: decide what the next character begins."""
        if byte.isalnum() or byte in identChars:
            # XXX FIXME really handle !DOCTYPE at some point
            if self.tagName == "!DOCTYPE":
                return "doctype"
            if self.tagName[0] in "!?":
                return "waitforgt"
            return "attrname"
        elif byte.isspace():
            return
        elif byte == ">":
            self.gotTagStart(self.tagName, self.tagAttributes)
            return self.maybeBodyData() if self.beExtremelyLenient else "bodydata"
        elif byte == "/":
            return "afterslash"
        elif self.beExtremelyLenient:
            # discard and move on?  Only case I've seen of this so far was:
            # <foo bar="baz"">
            return
        self._parseError("Unexpected character: %r" % byte)

    def begin_doctype(self, byte):
        """Start the doctype buffer with the triggering character."""
        self.doctype = byte

    def do_doctype(self, byte):
        """Buffer everything up to the closing '>'."""
        if byte == ">":
            return "bodydata"
        self.doctype += byte

    def end_doctype(self):
        """Report the buffered doctype text and drop the buffer."""
        self.gotDoctype(self.doctype)
        self.doctype = None

    def do_waitforgt(self, byte):
        """Ignore everything up to the next '>'."""
        if byte == ">":
            if self.endtag or not self.beExtremelyLenient:
                return "bodydata"
            return self.maybeBodyData()

    def begin_attrname(self, byte):
        """Start an attribute name with the triggering character."""
        self.attrname = byte
        self._attrname_termtag = 0

    def do_attrname(self, byte):
        """Accumulate an attribute name."""
        if byte.isalnum() or byte in identChars:
            self.attrname += byte
            return
        elif byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return "beforeeq"
        elif self.beExtremelyLenient:
            if byte in "\"'":
                return "attrval"
            if byte in lenientIdentChars or byte.isalnum():
                self.attrname += byte
                return
            # NOTE(review): "/" is part of lenientIdentChars, so the branch
            # above already consumes it and this one looks unreachable.
            if byte == "/":
                self._attrname_termtag = 1
                return
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._attrname_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            # something is really broken. let's leave this attribute where it
            # is and move on to the next thing
            return
        self._parseError(f"Invalid attribute name: {self.attrname!r} {byte!r}")

    def do_beforeattrval(self, byte):
        """After '=': expect the opening quote of the attribute value."""
        if byte in "\"'":
            return "attrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte in lenientIdentChars or byte.isalnum():
                return "messyattr"
            if byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                return self.maybeBodyData()
            if byte == "\\":
                # I saw this in actual HTML once:
                # <font size=\"3\"><sup>SM</sup></font>
                return
        self._parseError(
            "Invalid initial attribute value: %r; Attribute values must be quoted."
            % byte
        )

    attrname = ""
    attrval = ""

    def begin_beforeeq(self, byte):
        """Reset the flag recording a '/' seen before the '='."""
        self._beforeeq_termtag = 0

    def do_beforeeq(self, byte):
        """After an attribute name and whitespace: expect '='."""
        if byte == "=":
            return "beforeattrval"
        elif byte.isspace():
            return
        elif self.beExtremelyLenient:
            if byte.isalnum() or byte in identChars:
                # A value-less attribute followed by another attribute.
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                return "attrname"
            elif byte == ">":
                self.attrval = "True"
                self.tagAttributes[self.attrname] = self.attrval
                self.gotTagStart(self.tagName, self.tagAttributes)
                if self._beforeeq_termtag:
                    self.gotTagEnd(self.tagName)
                    return "bodydata"
                return self.maybeBodyData()
            elif byte == "/":
                self._beforeeq_termtag = 1
                return
        self._parseError("Invalid attribute")

    def begin_attrval(self, byte):
        """Remember the opening quote and start an empty value."""
        self.quotetype = byte
        self.attrval = ""

    def do_attrval(self, byte):
        """Accumulate the value until the matching closing quote."""
        if byte == self.quotetype:
            return "attrs"
        self.attrval += byte

    def end_attrval(self):
        """Store the finished attribute and reset the name/value buffers."""
        self.tagAttributes[self.attrname] = self.attrval
        self.attrname = self.attrval = ""

    def begin_messyattr(self, byte):
        """Start an unquoted attribute value with the triggering character."""
        self.attrval = byte

    def do_messyattr(self, byte):
        """Accumulate an unquoted value until whitespace or '>'."""
        if byte.isspace():
            return "attrs"
        elif byte == ">":
            endTag = 0
            if self.attrval.endswith("/"):
                # A trailing slash means the tag closes itself.
                endTag = 1
                self.attrval = self.attrval[:-1]
            self.tagAttributes[self.attrname] = self.attrval
            self.gotTagStart(self.tagName, self.tagAttributes)
            if endTag:
                self.gotTagEnd(self.tagName)
                return "bodydata"
            return self.maybeBodyData()
        else:
            self.attrval += byte

    def end_messyattr(self):
        """Store the unquoted attribute if it has a non-empty value."""
        if self.attrval:
            self.tagAttributes[self.attrname] = self.attrval

    def begin_afterslash(self, byte):
        """Reset the flag marking that the tag has already been closed."""
        self._after_slash_closed = 0

    def do_afterslash(self, byte):
        """Expect the '>' that closes a self-terminating tag."""
        # this state is only after a self-terminating slash, e.g. <foo/>
        if self._after_slash_closed:
            self._parseError("Mal-formed")  # XXX When does this happen??
        if byte != ">":
            if self.beExtremelyLenient:
                return
            else:
                self._parseError("No data allowed after '/'")
        self._after_slash_closed = 1
        self.gotTagStart(self.tagName, self.tagAttributes)
        self.gotTagEnd(self.tagName)
        # don't need maybeBodyData here because there better not be
        # any javascript code after a <script/>... we'll see :(
        return "bodydata"

    def begin_bodydata(self, byte):
        """Start the text buffer, seeded with any saved leading character."""
        if self._leadingBodyData:
            self.bodydata = self._leadingBodyData
            # Drop the instance attribute so the class default (None) shows
            # through again.
            del self._leadingBodyData
        else:
            self.bodydata = ""

    def do_bodydata(self, byte):
        """Accumulate text until a tag or entity reference begins."""
        if byte == "<":
            return "tagstart"
        if byte == "&":
            return "entityref"
        self.bodydata += byte

    def end_bodydata(self):
        """Report the accumulated text and clear the buffer."""
        self.gotText(self.bodydata)
        self.bodydata = ""

    def do_waitforendscript(self, byte):
        """Inside a script element: everything is text until a '<'."""
        if byte == "<":
            return "waitscriptendtag"
        self.bodydata += byte

    def begin_waitscriptendtag(self, byte):
        """Reset the buffers used to recognise a closing script tag."""
        self.temptagdata = ""
        self.tagName = ""
        self.endtag = 0

    def do_waitscriptendtag(self, byte):
        """Check whether the '<' just seen begins '</script'."""
        # 1 enforce / as first byte read
        # 2 enforce following bytes to be subset of "script" until
        #   tagName == "script"
        #   2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
        # 3 spaces can happen anywhere, they're ignored
        #   e.g. < / script >
        # 4 anything else causes all data I've read to be moved to the
        #   bodydata, and switch back to waitforendscript state

        # If it turns out this _isn't_ a </script>, we need to
        # remember all the data we've been through so we can append it
        # to bodydata
        self.temptagdata += byte

        # 1
        if byte == "/":
            self.endtag = True
        elif not self.endtag:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"
        # 2
        elif byte.isalnum() or byte in identChars:
            self.tagName += byte
            if not "script".startswith(self.tagName):
                self.bodydata += "<" + self.temptagdata
                return "waitforendscript"
            elif self.tagName == "script":
                self.gotText(self.bodydata)
                self.gotTagEnd(self.tagName)
                return "waitforgt"
        # 3
        elif byte.isspace():
            return "waitscriptendtag"
        # 4
        else:
            self.bodydata += "<" + self.temptagdata
            return "waitforendscript"

    def begin_entityref(self, byte):
        """Start with empty entity-reference buffers."""
        self.erefbuf = ""
        self.erefextra = ""  # extra bit for lenient mode

    def do_entityref(self, byte):
        """Accumulate an entity name until the terminating ';'."""
        if byte.isspace() or byte == "<":
            if self.beExtremelyLenient:
                # '&foo' probably was '&amp;foo'
                if self.erefbuf and self.erefbuf != "amp":
                    self.erefextra = self.erefbuf
                self.erefbuf = "amp"
                if byte == "<":
                    # NOTE(review): erefextra is not emitted on this path;
                    # only begin_spacebodydata consumes it.
                    return "tagstart"
                else:
                    self.erefextra += byte
                    return "spacebodydata"
            self._parseError("Bad entity reference")
        elif byte != ";":
            self.erefbuf += byte
        else:
            return "bodydata"

    def end_entityref(self):
        """Report the entity reference that was read."""
        self.gotEntityReference(self.erefbuf)

    # hacky support for space after & in entityref in beExtremelyLenient
    # state should only happen in that case
    def begin_spacebodydata(self, byte):
        """Seed the text buffer with the text that followed a bare '&'."""
        self.bodydata = self.erefextra
        self.erefextra = None

    do_spacebodydata = do_bodydata
    end_spacebodydata = end_bodydata

    # Sorta SAX-ish API

    def gotTagStart(self, name, attributes):
        """Encountered an opening tag.

        Default behaviour is to print."""
        print("begin", name, attributes)

    def gotText(self, data):
        """Encountered text

        Default behaviour is to print."""
        print("text:", repr(data))

    def gotEntityReference(self, entityRef):
        """Encountered mnemonic entity reference

        Default behaviour is to print."""
        print("entityRef: &%s;" % entityRef)

    def gotComment(self, comment):
        """Encountered comment.

        Default behaviour is to ignore."""
        pass

    def gotCData(self, cdata):
        """Encountered CDATA

        Default behaviour is to call the gotText method"""
        self.gotText(cdata)

    def gotDoctype(self, doctype):
        """Encountered DOCTYPE

        This is really grotty: it basically just gives you everything between
        '<!DOCTYPE' and '>' as an argument.
        """
        print("!DOCTYPE", repr(doctype))

    def gotTagEnd(self, name):
        """Encountered closing tag

        Default behaviour is to print."""
        print("end", name)
|