|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292 |
- """Compare two HTML documents."""
- import html
- from html.parser import HTMLParser
-
- from django.utils.regex_helper import _lazy_re_compile
-
- # ASCII whitespace is U+0009 TAB, U+000A LF, U+000C FF, U+000D CR, or U+0020
- # SPACE.
- # https://infra.spec.whatwg.org/#ascii-whitespace
- # The pattern matches a run of one or more of those characters.
- ASCII_WHITESPACE = _lazy_re_compile(r"[\t\n\f\r ]+")
-
# Names of HTML boolean attributes: their presence alone carries the meaning,
# so normalize_attributes() treats `<input checked>` and
# `<input checked="checked">` as the same thing.
# Each entry must be the bare attribute name, with no surrounding whitespace,
# because membership is tested against the names produced by the parser.
# https://html.spec.whatwg.org/#attributes-3
BOOLEAN_ATTRIBUTES = {
    "allowfullscreen",
    "async",
    "autofocus",
    "autoplay",
    "checked",
    "controls",
    "default",
    "defer",
    "disabled",
    "formnovalidate",
    "hidden",
    "ismap",
    "itemscope",
    "loop",
    "multiple",
    "muted",
    "nomodule",
    "novalidate",
    "open",
    "playsinline",
    "readonly",
    "required",
    "reversed",
    "selected",
    # Attributes for deprecated tags.
    "truespeed",
}
-
-
def normalize_whitespace(string):
    """Return ``string`` with each run of ASCII whitespace replaced by one space."""
    collapsed = ASCII_WHITESPACE.sub(" ", string)
    return collapsed
-
-
def normalize_attributes(attributes):
    """
    Return a list of ``(name, value)`` pairs in a canonical form.

    ``attributes`` is an iterable of ``(name, value)`` pairs. The order of the
    pairs is preserved; only the values are rewritten.
    """

    def canonical(name, value):
        if name == "class" and value:
            # Sort the individual class names so that comparisons of DOM
            # instances don't depend on the order classes were written in.
            class_names = filter(None, ASCII_WHITESPACE.split(value))
            value = " ".join(sorted(class_names))
        if name in BOOLEAN_ATTRIBUTES:
            # A boolean attribute with no value, an empty value, or a value
            # equal to its own name all mean the same thing, e.g.
            #   <input checked> == <input checked="checked">
            if not value or value == name:
                value = None
        elif value is None:
            # A non-boolean attribute without a value is an empty string.
            value = ""
        return name, value

    return [canonical(name, value) for name, value in attributes]
-
-
class Element:
    """
    A node in a parsed HTML tree.

    It holds a tag name, a sorted list of ``(name, value)`` attribute pairs,
    and an ordered list of children, each either a text string or another
    node.
    """

    def __init__(self, name, attributes):
        self.name = name
        # Attributes are stored sorted so that equality and hashing do not
        # depend on the order they were given in.
        ordered = list(attributes)
        ordered.sort()
        self.attributes = ordered
        self.children = []

    def append(self, element):
        """Add a text string or a child node to the end of the children."""
        kids = self.children
        if isinstance(element, str):
            element = normalize_whitespace(element)
            if kids and isinstance(kids[-1], str):
                # Merge adjacent text into a single normalized string.
                kids[-1] = normalize_whitespace(kids[-1] + element)
                return
        elif kids:
            # Drop a whitespace-only text child that directly precedes a
            # node. This can give an incorrect DOM representation, because
            # whitespace between inline tags like <span> is significant.
            previous = kids[-1]
            if isinstance(previous, str) and previous.isspace():
                kids.pop()
        if element:
            kids.append(element)

    def finalize(self):
        """Trim text children in place and finalize child nodes recursively."""
        kids = self.children
        # Right-strip trailing text children, discarding the ones that end
        # up empty, until a non-empty text or a node is the last child.
        while kids and isinstance(kids[-1], str):
            kids[-1] = kids[-1].rstrip()
            if kids[-1]:
                break
            kids.pop()
        for index, child in enumerate(kids):
            if isinstance(child, str):
                kids[index] = child.strip()
            elif hasattr(child, "finalize"):
                child.finalize()

    def __eq__(self, element):
        if not hasattr(element, "name"):
            return False
        if self.name != element.name or self.attributes != element.attributes:
            return False
        return self.children == element.children

    def __hash__(self):
        # Children are deliberately left out; only the tag and attributes
        # contribute to the hash.
        return hash((self.name,) + tuple(self.attributes))

    def _count(self, element, count=True):
        """
        Count occurrences of ``element`` (text or node) within this node.

        With ``count=False`` the search stops at the first hit and a truthy
        number is returned instead of the full total.
        """
        searching_text = isinstance(element, str)
        if not searching_text and self == element:
            return 1
        searching_root = isinstance(element, RootElement)
        if searching_root and self.children == element.children:
            return 1
        total = 0
        matched = 0
        for child in self.children:
            if isinstance(child, str):
                # Text is only ever searched for inside text: a plain
                # "text" in "text" test.
                if not searching_text:
                    continue
                if count:
                    total += child.count(element)
                elif element in child:
                    return 1
                continue
            # Look for element wholly within this child.
            total += child._count(element, count=count)
            if not count and total:
                return total
            # Also look for a sequence of element's children among self's
            # children. The whole-list comparison above fails when self has
            # additional children. Ex: '<a/><b/>' is contained in
            # '<a/><b/><c/>'.
            if not (searching_root and element.children):
                continue
            if element.children[matched] == child:
                # Start or continue a match.
                matched += 1
                if matched == len(element.children):
                    # Complete match: record it and start over.
                    total += 1
                    matched = 0
            else:
                # Mismatch: start over.
                matched = 0
        return total

    def __contains__(self, element):
        return self._count(element, count=False) > 0

    def count(self, element):
        """Return how many times ``element`` occurs within this node."""
        return self._count(element, count=True)

    def __getitem__(self, key):
        return self.children[key]

    def __str__(self):
        pieces = ["<%s" % self.name]
        for key, value in self.attributes:
            if value is None:
                # Value-less attributes are rendered as the bare name.
                pieces.append(" %s" % key)
            else:
                pieces.append(' %s="%s"' % (key, value))
        if not self.children:
            pieces.append(">")
            return "".join(pieces)
        pieces.append(">\n")
        pieces.extend(
            [
                html.escape(child) if isinstance(child, str) else str(child)
                for child in self.children
            ]
        )
        pieces.append("\n</%s>" % self.name)
        return "".join(pieces)

    def __repr__(self):
        return str(self)
-
-
class RootElement(Element):
    """A nameless, attribute-less container for the top-level nodes."""

    def __init__(self):
        super().__init__(None, ())

    def __str__(self):
        # Only the children are rendered; the root has no tag of its own.
        rendered = []
        for child in self.children:
            if isinstance(child, str):
                rendered.append(html.escape(child))
            else:
                rendered.append(str(child))
        return "".join(rendered)
-
-
class HTMLParseError(Exception):
    """Raised by Parser.error(), e.g. when an unexpected end tag is found."""
-
-
class Parser(HTMLParser):
    """
    HTMLParser subclass that builds a tree of Element objects under
    ``self.root`` while tracking the stack of currently open tags.
    """

    # Tags that never get a closing tag, so they are not pushed onto the
    # stack of open tags.
    # https://html.spec.whatwg.org/#void-elements
    SELF_CLOSING_TAGS = {
        "area",
        "base",
        "br",
        "col",
        "embed",
        "hr",
        "img",
        "input",
        "link",
        "meta",
        "param",
        "source",
        "track",
        "wbr",
        # Deprecated tags
        "frame",
        "spacer",
    }

    def __init__(self):
        super().__init__()
        self.root = RootElement()
        # Stack of elements whose end tag has not been seen yet.
        self.open_tags = []
        # Maps each created element to the parser position recorded for it.
        self.element_positions = {}

    def error(self, msg):
        """Raise HTMLParseError with ``msg`` and the current position."""
        raise HTMLParseError(msg, self.getpos())

    def format_position(self, position=None, element=None):
        """
        Return a "Line N, Column M" string.

        The position is taken from ``position`` if given, else from the one
        recorded for ``element``, else from the parser's current position.
        """
        if not position and element:
            position = self.element_positions[element]
        if position is None:
            position = self.getpos()
        if hasattr(position, "lineno"):
            position = (position.lineno, position.offset)
        return "Line %d, Column %d" % position

    @property
    def current(self):
        """The innermost open element, or the root when none is open."""
        return self.open_tags[-1] if self.open_tags else self.root

    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        # Void tags were never opened, so there is nothing to close.
        if tag not in self.SELF_CLOSING_TAGS:
            self.handle_endtag(tag)

    def handle_starttag(self, tag, attrs):
        node = Element(tag, normalize_attributes(attrs))
        self.current.append(node)
        if tag not in self.SELF_CLOSING_TAGS:
            self.open_tags.append(node)
        self.element_positions[node] = self.getpos()

    def handle_endtag(self, tag):
        # Pop open elements until the one matching ``tag`` has been removed;
        # running out of open elements first is an error.
        closed = None
        while closed is None or closed.name != tag:
            if not self.open_tags:
                self.error(
                    "Unexpected end tag `%s` (%s)" % (tag, self.format_position())
                )
            closed = self.open_tags.pop()

    def handle_data(self, data):
        self.current.append(data)
-
-
def parse_html(html):
    """
    Parse a string of HTML into a Python object structure that can be
    compared with other parsed HTML for semantic equivalence. Purely
    syntactical differences, such as which quotation marks surround an
    attribute value, are ignored.
    """
    parser = Parser()
    parser.feed(html)
    parser.close()
    root = parser.root
    root.finalize()
    # The root wrapper is only needed when there are several top-level
    # nodes or the single top-level node is text.
    top_level = root.children
    if len(top_level) != 1:
        return root
    only_child = top_level[0]
    if isinstance(only_child, str):
        return root
    return only_child
|