You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

__init__.py 5.6KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154
  1. """A collection of modules for iterating through different kinds of
  2. tree, generating tokens identical to those produced by the tokenizer
  3. module.
  4. To create a tree walker for a new type of tree, you need to
  5. implement a tree walker object (called TreeWalker by convention) that
  6. implements a 'serialize' method taking a tree as sole argument and
  7. returning an iterator generating tokens.
  8. """
  9. from __future__ import absolute_import, division, unicode_literals
  10. from .. import constants
  11. from .._utils import default_etree
# Public names exported by this package.
__all__ = ["getTreeWalker", "pprint"]

# Maps a lower-cased tree type name to its TreeWalker class. Filled lazily by
# getTreeWalker() for the "dom", "genshi" and "lxml" tree types; the "etree"
# type is deliberately not stored here (its caching is done in the etree
# submodule).
treeWalkerCache = {}
  14. def getTreeWalker(treeType, implementation=None, **kwargs):
  15. """Get a TreeWalker class for various types of tree with built-in support
  16. :arg str treeType: the name of the tree type required (case-insensitive).
  17. Supported values are:
  18. * "dom": The xml.dom.minidom DOM implementation
  19. * "etree": A generic walker for tree implementations exposing an
  20. elementtree-like interface (known to work with ElementTree,
  21. cElementTree and lxml.etree).
  22. * "lxml": Optimized walker for lxml.etree
  23. * "genshi": a Genshi stream
  24. :arg implementation: A module implementing the tree type e.g.
  25. xml.etree.ElementTree or cElementTree (Currently applies to the "etree"
  26. tree type only).
  27. :arg kwargs: keyword arguments passed to the etree walker--for other
  28. walkers, this has no effect
  29. :returns: a TreeWalker class
  30. """
  31. treeType = treeType.lower()
  32. if treeType not in treeWalkerCache:
  33. if treeType == "dom":
  34. from . import dom
  35. treeWalkerCache[treeType] = dom.TreeWalker
  36. elif treeType == "genshi":
  37. from . import genshi
  38. treeWalkerCache[treeType] = genshi.TreeWalker
  39. elif treeType == "lxml":
  40. from . import etree_lxml
  41. treeWalkerCache[treeType] = etree_lxml.TreeWalker
  42. elif treeType == "etree":
  43. from . import etree
  44. if implementation is None:
  45. implementation = default_etree
  46. # XXX: NEVER cache here, caching is done in the etree submodule
  47. return etree.getETreeModule(implementation, **kwargs).TreeWalker
  48. return treeWalkerCache.get(treeType)
  49. def concatenateCharacterTokens(tokens):
  50. pendingCharacters = []
  51. for token in tokens:
  52. type = token["type"]
  53. if type in ("Characters", "SpaceCharacters"):
  54. pendingCharacters.append(token["data"])
  55. else:
  56. if pendingCharacters:
  57. yield {"type": "Characters", "data": "".join(pendingCharacters)}
  58. pendingCharacters = []
  59. yield token
  60. if pendingCharacters:
  61. yield {"type": "Characters", "data": "".join(pendingCharacters)}
  62. def pprint(walker):
  63. """Pretty printer for tree walkers
  64. Takes a TreeWalker instance and pretty prints the output of walking the tree.
  65. :arg walker: a TreeWalker instance
  66. """
  67. output = []
  68. indent = 0
  69. for token in concatenateCharacterTokens(walker):
  70. type = token["type"]
  71. if type in ("StartTag", "EmptyTag"):
  72. # tag name
  73. if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
  74. if token["namespace"] in constants.prefixes:
  75. ns = constants.prefixes[token["namespace"]]
  76. else:
  77. ns = token["namespace"]
  78. name = "%s %s" % (ns, token["name"])
  79. else:
  80. name = token["name"]
  81. output.append("%s<%s>" % (" " * indent, name))
  82. indent += 2
  83. # attributes (sorted for consistent ordering)
  84. attrs = token["data"]
  85. for (namespace, localname), value in sorted(attrs.items()):
  86. if namespace:
  87. if namespace in constants.prefixes:
  88. ns = constants.prefixes[namespace]
  89. else:
  90. ns = namespace
  91. name = "%s %s" % (ns, localname)
  92. else:
  93. name = localname
  94. output.append("%s%s=\"%s\"" % (" " * indent, name, value))
  95. # self-closing
  96. if type == "EmptyTag":
  97. indent -= 2
  98. elif type == "EndTag":
  99. indent -= 2
  100. elif type == "Comment":
  101. output.append("%s<!-- %s -->" % (" " * indent, token["data"]))
  102. elif type == "Doctype":
  103. if token["name"]:
  104. if token["publicId"]:
  105. output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
  106. (" " * indent,
  107. token["name"],
  108. token["publicId"],
  109. token["systemId"] if token["systemId"] else ""))
  110. elif token["systemId"]:
  111. output.append("""%s<!DOCTYPE %s "" "%s">""" %
  112. (" " * indent,
  113. token["name"],
  114. token["systemId"]))
  115. else:
  116. output.append("%s<!DOCTYPE %s>" % (" " * indent,
  117. token["name"]))
  118. else:
  119. output.append("%s<!DOCTYPE >" % (" " * indent,))
  120. elif type == "Characters":
  121. output.append("%s\"%s\"" % (" " * indent, token["data"]))
  122. elif type == "SpaceCharacters":
  123. assert False, "concatenateCharacterTokens should have got rid of all Space tokens"
  124. else:
  125. raise ValueError("Unknown token type, %s" % type)
  126. return "\n".join(output)