You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

etree_lxml.py 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366
  1. """Module for supporting the lxml.etree library. The idea here is to use as much
  2. of the native library as possible, without using fragile hacks like custom element
  3. names that break between releases. The downside of this is that we cannot represent
  4. all possible trees; specifically the following are known to cause problems:
  5. Text or comments as siblings of the root element
  6. Doctypes with no name
  7. When any of these things occur, we emit a DataLossWarning
  8. """
  9. from __future__ import absolute_import, division, unicode_literals
  10. # pylint:disable=protected-access
  11. import warnings
  12. import re
  13. import sys
  14. from . import base
  15. from ..constants import DataLossWarning
  16. from .. import constants
  17. from . import etree as etree_builders
  18. from .. import _ihatexml
  19. import lxml.etree as etree
# Consulted by TreeBuilder.getDocument: when true the whole lxml ElementTree
# is returned, otherwise only its root element.
fullTree = True
# Splits a "{namespace}local" name into its namespace and local part.
tag_regexp = re.compile("{([^}]*)}(.*)")
# The .tag value lxml gives to comment nodes, read from a throwaway comment;
# serializers compare element.tag against it to recognise comments.
comment_type = etree.Comment("asd").tag
  23. class DocumentType(object):
  24. def __init__(self, name, publicId, systemId):
  25. self.name = name
  26. self.publicId = publicId
  27. self.systemId = systemId
  28. class Document(object):
  29. def __init__(self):
  30. self._elementTree = None
  31. self._childNodes = []
  32. def appendChild(self, element):
  33. self._elementTree.getroot().addnext(element._element)
  34. def _getChildNodes(self):
  35. return self._childNodes
  36. childNodes = property(_getChildNodes)
  37. def testSerializer(element):
  38. rv = []
  39. infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
  40. def serializeElement(element, indent=0):
  41. if not hasattr(element, "tag"):
  42. if hasattr(element, "getroot"):
  43. # Full tree case
  44. rv.append("#document")
  45. if element.docinfo.internalDTD:
  46. if not (element.docinfo.public_id or
  47. element.docinfo.system_url):
  48. dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
  49. else:
  50. dtd_str = """<!DOCTYPE %s "%s" "%s">""" % (
  51. element.docinfo.root_name,
  52. element.docinfo.public_id,
  53. element.docinfo.system_url)
  54. rv.append("|%s%s" % (' ' * (indent + 2), dtd_str))
  55. next_element = element.getroot()
  56. while next_element.getprevious() is not None:
  57. next_element = next_element.getprevious()
  58. while next_element is not None:
  59. serializeElement(next_element, indent + 2)
  60. next_element = next_element.getnext()
  61. elif isinstance(element, str) or isinstance(element, bytes):
  62. # Text in a fragment
  63. assert isinstance(element, str) or sys.version_info[0] == 2
  64. rv.append("|%s\"%s\"" % (' ' * indent, element))
  65. else:
  66. # Fragment case
  67. rv.append("#document-fragment")
  68. for next_element in element:
  69. serializeElement(next_element, indent + 2)
  70. elif element.tag == comment_type:
  71. rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
  72. if hasattr(element, "tail") and element.tail:
  73. rv.append("|%s\"%s\"" % (' ' * indent, element.tail))
  74. else:
  75. assert isinstance(element, etree._Element)
  76. nsmatch = etree_builders.tag_regexp.match(element.tag)
  77. if nsmatch is not None:
  78. ns = nsmatch.group(1)
  79. tag = nsmatch.group(2)
  80. prefix = constants.prefixes[ns]
  81. rv.append("|%s<%s %s>" % (' ' * indent, prefix,
  82. infosetFilter.fromXmlName(tag)))
  83. else:
  84. rv.append("|%s<%s>" % (' ' * indent,
  85. infosetFilter.fromXmlName(element.tag)))
  86. if hasattr(element, "attrib"):
  87. attributes = []
  88. for name, value in element.attrib.items():
  89. nsmatch = tag_regexp.match(name)
  90. if nsmatch is not None:
  91. ns, name = nsmatch.groups()
  92. name = infosetFilter.fromXmlName(name)
  93. prefix = constants.prefixes[ns]
  94. attr_string = "%s %s" % (prefix, name)
  95. else:
  96. attr_string = infosetFilter.fromXmlName(name)
  97. attributes.append((attr_string, value))
  98. for name, value in sorted(attributes):
  99. rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
  100. if element.text:
  101. rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
  102. indent += 2
  103. for child in element:
  104. serializeElement(child, indent)
  105. if hasattr(element, "tail") and element.tail:
  106. rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
  107. serializeElement(element, 0)
  108. return "\n".join(rv)
  109. def tostring(element):
  110. """Serialize an element and its child nodes to a string"""
  111. rv = []
  112. def serializeElement(element):
  113. if not hasattr(element, "tag"):
  114. if element.docinfo.internalDTD:
  115. if element.docinfo.doctype:
  116. dtd_str = element.docinfo.doctype
  117. else:
  118. dtd_str = "<!DOCTYPE %s>" % element.docinfo.root_name
  119. rv.append(dtd_str)
  120. serializeElement(element.getroot())
  121. elif element.tag == comment_type:
  122. rv.append("<!--%s-->" % (element.text,))
  123. else:
  124. # This is assumed to be an ordinary element
  125. if not element.attrib:
  126. rv.append("<%s>" % (element.tag,))
  127. else:
  128. attr = " ".join(["%s=\"%s\"" % (name, value)
  129. for name, value in element.attrib.items()])
  130. rv.append("<%s %s>" % (element.tag, attr))
  131. if element.text:
  132. rv.append(element.text)
  133. for child in element:
  134. serializeElement(child)
  135. rv.append("</%s>" % (element.tag,))
  136. if hasattr(element, "tail") and element.tail:
  137. rv.append(element.tail)
  138. serializeElement(element)
  139. return "".join(rv)
  140. class TreeBuilder(base.TreeBuilder):
  141. documentClass = Document
  142. doctypeClass = DocumentType
  143. elementClass = None
  144. commentClass = None
  145. fragmentClass = Document
  146. implementation = etree
  147. def __init__(self, namespaceHTMLElements, fullTree=False):
  148. builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
  149. infosetFilter = self.infosetFilter = _ihatexml.InfosetFilter(preventDoubleDashComments=True)
  150. self.namespaceHTMLElements = namespaceHTMLElements
  151. class Attributes(dict):
  152. def __init__(self, element, value=None):
  153. if value is None:
  154. value = {}
  155. self._element = element
  156. dict.__init__(self, value) # pylint:disable=non-parent-init-called
  157. for key, value in self.items():
  158. if isinstance(key, tuple):
  159. name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
  160. else:
  161. name = infosetFilter.coerceAttribute(key)
  162. self._element._element.attrib[name] = value
  163. def __setitem__(self, key, value):
  164. dict.__setitem__(self, key, value)
  165. if isinstance(key, tuple):
  166. name = "{%s}%s" % (key[2], infosetFilter.coerceAttribute(key[1]))
  167. else:
  168. name = infosetFilter.coerceAttribute(key)
  169. self._element._element.attrib[name] = value
  170. class Element(builder.Element):
  171. def __init__(self, name, namespace):
  172. name = infosetFilter.coerceElement(name)
  173. builder.Element.__init__(self, name, namespace=namespace)
  174. self._attributes = Attributes(self)
  175. def _setName(self, name):
  176. self._name = infosetFilter.coerceElement(name)
  177. self._element.tag = self._getETreeTag(
  178. self._name, self._namespace)
  179. def _getName(self):
  180. return infosetFilter.fromXmlName(self._name)
  181. name = property(_getName, _setName)
  182. def _getAttributes(self):
  183. return self._attributes
  184. def _setAttributes(self, attributes):
  185. self._attributes = Attributes(self, attributes)
  186. attributes = property(_getAttributes, _setAttributes)
  187. def insertText(self, data, insertBefore=None):
  188. data = infosetFilter.coerceCharacters(data)
  189. builder.Element.insertText(self, data, insertBefore)
  190. def appendChild(self, child):
  191. builder.Element.appendChild(self, child)
  192. class Comment(builder.Comment):
  193. def __init__(self, data):
  194. data = infosetFilter.coerceComment(data)
  195. builder.Comment.__init__(self, data)
  196. def _setData(self, data):
  197. data = infosetFilter.coerceComment(data)
  198. self._element.text = data
  199. def _getData(self):
  200. return self._element.text
  201. data = property(_getData, _setData)
  202. self.elementClass = Element
  203. self.commentClass = Comment
  204. # self.fragmentClass = builder.DocumentFragment
  205. base.TreeBuilder.__init__(self, namespaceHTMLElements)
  206. def reset(self):
  207. base.TreeBuilder.reset(self)
  208. self.insertComment = self.insertCommentInitial
  209. self.initial_comments = []
  210. self.doctype = None
  211. def testSerializer(self, element):
  212. return testSerializer(element)
  213. def getDocument(self):
  214. if fullTree:
  215. return self.document._elementTree
  216. else:
  217. return self.document._elementTree.getroot()
  218. def getFragment(self):
  219. fragment = []
  220. element = self.openElements[0]._element
  221. if element.text:
  222. fragment.append(element.text)
  223. fragment.extend(list(element))
  224. if element.tail:
  225. fragment.append(element.tail)
  226. return fragment
  227. def insertDoctype(self, token):
  228. name = token["name"]
  229. publicId = token["publicId"]
  230. systemId = token["systemId"]
  231. if not name:
  232. warnings.warn("lxml cannot represent empty doctype", DataLossWarning)
  233. self.doctype = None
  234. else:
  235. coercedName = self.infosetFilter.coerceElement(name)
  236. if coercedName != name:
  237. warnings.warn("lxml cannot represent non-xml doctype", DataLossWarning)
  238. doctype = self.doctypeClass(coercedName, publicId, systemId)
  239. self.doctype = doctype
  240. def insertCommentInitial(self, data, parent=None):
  241. assert parent is None or parent is self.document
  242. assert self.document._elementTree is None
  243. self.initial_comments.append(data)
  244. def insertCommentMain(self, data, parent=None):
  245. if (parent == self.document and
  246. self.document._elementTree.getroot()[-1].tag == comment_type):
  247. warnings.warn("lxml cannot represent adjacent comments beyond the root elements", DataLossWarning)
  248. super(TreeBuilder, self).insertComment(data, parent)
  249. def insertRoot(self, token):
  250. # Because of the way libxml2 works, it doesn't seem to be possible to
  251. # alter information like the doctype after the tree has been parsed.
  252. # Therefore we need to use the built-in parser to create our initial
  253. # tree, after which we can add elements like normal
  254. docStr = ""
  255. if self.doctype:
  256. assert self.doctype.name
  257. docStr += "<!DOCTYPE %s" % self.doctype.name
  258. if (self.doctype.publicId is not None or
  259. self.doctype.systemId is not None):
  260. docStr += (' PUBLIC "%s" ' %
  261. (self.infosetFilter.coercePubid(self.doctype.publicId or "")))
  262. if self.doctype.systemId:
  263. sysid = self.doctype.systemId
  264. if sysid.find("'") >= 0 and sysid.find('"') >= 0:
  265. warnings.warn("DOCTYPE system cannot contain single and double quotes", DataLossWarning)
  266. sysid = sysid.replace("'", 'U00027')
  267. if sysid.find("'") >= 0:
  268. docStr += '"%s"' % sysid
  269. else:
  270. docStr += "'%s'" % sysid
  271. else:
  272. docStr += "''"
  273. docStr += ">"
  274. if self.doctype.name != token["name"]:
  275. warnings.warn("lxml cannot represent doctype with a different name to the root element", DataLossWarning)
  276. docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
  277. root = etree.fromstring(docStr)
  278. # Append the initial comments:
  279. for comment_token in self.initial_comments:
  280. comment = self.commentClass(comment_token["data"])
  281. root.addprevious(comment._element)
  282. # Create the root document and add the ElementTree to it
  283. self.document = self.documentClass()
  284. self.document._elementTree = root.getroottree()
  285. # Give the root element the right name
  286. name = token["name"]
  287. namespace = token.get("namespace", self.defaultNamespace)
  288. if namespace is None:
  289. etree_tag = name
  290. else:
  291. etree_tag = "{%s}%s" % (namespace, name)
  292. root.tag = etree_tag
  293. # Add the root element to the internal child/open data structures
  294. root_element = self.elementClass(name, namespace)
  295. root_element._element = root
  296. self.document._childNodes.append(root_element)
  297. self.openElements.append(root_element)
  298. # Reset to the default insert comment function
  299. self.insertComment = self.insertCommentMain