You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

etree_lxml.py 6.2KB


  1. from __future__ import absolute_import, division, unicode_literals
  2. from pip._vendor.six import text_type
  3. from lxml import etree
  4. from ..treebuilders.etree import tag_regexp
  5. from . import base
  6. from .. import _ihatexml
  7. def ensure_str(s):
  8. if s is None:
  9. return None
  10. elif isinstance(s, text_type):
  11. return s
  12. else:
  13. return s.decode("ascii", "strict")
  14. class Root(object):
  15. def __init__(self, et):
  16. self.elementtree = et
  17. self.children = []
  18. try:
  19. if et.docinfo.internalDTD:
  20. self.children.append(Doctype(self,
  21. ensure_str(et.docinfo.root_name),
  22. ensure_str(et.docinfo.public_id),
  23. ensure_str(et.docinfo.system_url)))
  24. except AttributeError:
  25. pass
  26. try:
  27. node = et.getroot()
  28. except AttributeError:
  29. node = et
  30. while node.getprevious() is not None:
  31. node = node.getprevious()
  32. while node is not None:
  33. self.children.append(node)
  34. node = node.getnext()
  35. self.text = None
  36. self.tail = None
  37. def __getitem__(self, key):
  38. return self.children[key]
  39. def getnext(self):
  40. return None
  41. def __len__(self):
  42. return 1
  43. class Doctype(object):
  44. def __init__(self, root_node, name, public_id, system_id):
  45. self.root_node = root_node
  46. self.name = name
  47. self.public_id = public_id
  48. self.system_id = system_id
  49. self.text = None
  50. self.tail = None
  51. def getnext(self):
  52. return self.root_node.children[1]
  53. class FragmentRoot(Root):
  54. def __init__(self, children):
  55. self.children = [FragmentWrapper(self, child) for child in children]
  56. self.text = self.tail = None
  57. def getnext(self):
  58. return None
  59. class FragmentWrapper(object):
  60. def __init__(self, fragment_root, obj):
  61. self.root_node = fragment_root
  62. self.obj = obj
  63. if hasattr(self.obj, 'text'):
  64. self.text = ensure_str(self.obj.text)
  65. else:
  66. self.text = None
  67. if hasattr(self.obj, 'tail'):
  68. self.tail = ensure_str(self.obj.tail)
  69. else:
  70. self.tail = None
  71. def __getattr__(self, name):
  72. return getattr(self.obj, name)
  73. def getnext(self):
  74. siblings = self.root_node.children
  75. idx = siblings.index(self)
  76. if idx < len(siblings) - 1:
  77. return siblings[idx + 1]
  78. else:
  79. return None
  80. def __getitem__(self, key):
  81. return self.obj[key]
  82. def __bool__(self):
  83. return bool(self.obj)
  84. def getparent(self):
  85. return None
  86. def __str__(self):
  87. return str(self.obj)
  88. def __unicode__(self):
  89. return str(self.obj)
  90. def __len__(self):
  91. return len(self.obj)
  92. class TreeWalker(base.NonRecursiveTreeWalker):
  93. def __init__(self, tree):
  94. # pylint:disable=redefined-variable-type
  95. if isinstance(tree, list):
  96. self.fragmentChildren = set(tree)
  97. tree = FragmentRoot(tree)
  98. else:
  99. self.fragmentChildren = set()
  100. tree = Root(tree)
  101. base.NonRecursiveTreeWalker.__init__(self, tree)
  102. self.filter = _ihatexml.InfosetFilter()
  103. def getNodeDetails(self, node):
  104. if isinstance(node, tuple): # Text node
  105. node, key = node
  106. assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
  107. return base.TEXT, ensure_str(getattr(node, key))
  108. elif isinstance(node, Root):
  109. return (base.DOCUMENT,)
  110. elif isinstance(node, Doctype):
  111. return base.DOCTYPE, node.name, node.public_id, node.system_id
  112. elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"):
  113. return base.TEXT, ensure_str(node.obj)
  114. elif node.tag == etree.Comment:
  115. return base.COMMENT, ensure_str(node.text)
  116. elif node.tag == etree.Entity:
  117. return base.ENTITY, ensure_str(node.text)[1:-1] # strip &;
  118. else:
  119. # This is assumed to be an ordinary element
  120. match = tag_regexp.match(ensure_str(node.tag))
  121. if match:
  122. namespace, tag = match.groups()
  123. else:
  124. namespace = None
  125. tag = ensure_str(node.tag)
  126. attrs = {}
  127. for name, value in list(node.attrib.items()):
  128. name = ensure_str(name)
  129. value = ensure_str(value)
  130. match = tag_regexp.match(name)
  131. if match:
  132. attrs[(match.group(1), match.group(2))] = value
  133. else:
  134. attrs[(None, name)] = value
  135. return (base.ELEMENT, namespace, self.filter.fromXmlName(tag),
  136. attrs, len(node) > 0 or node.text)
  137. def getFirstChild(self, node):
  138. assert not isinstance(node, tuple), "Text nodes have no children"
  139. assert len(node) or node.text, "Node has no children"
  140. if node.text:
  141. return (node, "text")
  142. else:
  143. return node[0]
  144. def getNextSibling(self, node):
  145. if isinstance(node, tuple): # Text node
  146. node, key = node
  147. assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
  148. if key == "text":
  149. # XXX: we cannot use a "bool(node) and node[0] or None" construct here
  150. # because node[0] might evaluate to False if it has no child element
  151. if len(node):
  152. return node[0]
  153. else:
  154. return None
  155. else: # tail
  156. return node.getnext()
  157. return (node, "tail") if node.tail else node.getnext()
  158. def getParentNode(self, node):
  159. if isinstance(node, tuple): # Text node
  160. node, key = node
  161. assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key
  162. if key == "text":
  163. return node
  164. # else: fallback to "normal" processing
  165. elif node in self.fragmentChildren:
  166. return None
  167. return node.getparent()