You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

optionaltags.py 10KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. from __future__ import absolute_import, division, unicode_literals
  2. from . import base
  3. class Filter(base.Filter):
  4. """Removes optional tags from the token stream"""
  5. def slider(self):
  6. previous1 = previous2 = None
  7. for token in self.source:
  8. if previous1 is not None:
  9. yield previous2, previous1, token
  10. previous2 = previous1
  11. previous1 = token
  12. if previous1 is not None:
  13. yield previous2, previous1, None
  14. def __iter__(self):
  15. for previous, token, next in self.slider():
  16. type = token["type"]
  17. if type == "StartTag":
  18. if (token["data"] or
  19. not self.is_optional_start(token["name"], previous, next)):
  20. yield token
  21. elif type == "EndTag":
  22. if not self.is_optional_end(token["name"], next):
  23. yield token
  24. else:
  25. yield token
  26. def is_optional_start(self, tagname, previous, next):
  27. type = next and next["type"] or None
  28. if tagname in 'html':
  29. # An html element's start tag may be omitted if the first thing
  30. # inside the html element is not a space character or a comment.
  31. return type not in ("Comment", "SpaceCharacters")
  32. elif tagname == 'head':
  33. # A head element's start tag may be omitted if the first thing
  34. # inside the head element is an element.
  35. # XXX: we also omit the start tag if the head element is empty
  36. if type in ("StartTag", "EmptyTag"):
  37. return True
  38. elif type == "EndTag":
  39. return next["name"] == "head"
  40. elif tagname == 'body':
  41. # A body element's start tag may be omitted if the first thing
  42. # inside the body element is not a space character or a comment,
  43. # except if the first thing inside the body element is a script
  44. # or style element and the node immediately preceding the body
  45. # element is a head element whose end tag has been omitted.
  46. if type in ("Comment", "SpaceCharacters"):
  47. return False
  48. elif type == "StartTag":
  49. # XXX: we do not look at the preceding event, so we never omit
  50. # the body element's start tag if it's followed by a script or
  51. # a style element.
  52. return next["name"] not in ('script', 'style')
  53. else:
  54. return True
  55. elif tagname == 'colgroup':
  56. # A colgroup element's start tag may be omitted if the first thing
  57. # inside the colgroup element is a col element, and if the element
  58. # is not immediately preceded by another colgroup element whose
  59. # end tag has been omitted.
  60. if type in ("StartTag", "EmptyTag"):
  61. # XXX: we do not look at the preceding event, so instead we never
  62. # omit the colgroup element's end tag when it is immediately
  63. # followed by another colgroup element. See is_optional_end.
  64. return next["name"] == "col"
  65. else:
  66. return False
  67. elif tagname == 'tbody':
  68. # A tbody element's start tag may be omitted if the first thing
  69. # inside the tbody element is a tr element, and if the element is
  70. # not immediately preceded by a tbody, thead, or tfoot element
  71. # whose end tag has been omitted.
  72. if type == "StartTag":
  73. # omit the thead and tfoot elements' end tag when they are
  74. # immediately followed by a tbody element. See is_optional_end.
  75. if previous and previous['type'] == 'EndTag' and \
  76. previous['name'] in ('tbody', 'thead', 'tfoot'):
  77. return False
  78. return next["name"] == 'tr'
  79. else:
  80. return False
  81. return False
  82. def is_optional_end(self, tagname, next):
  83. type = next and next["type"] or None
  84. if tagname in ('html', 'head', 'body'):
  85. # An html element's end tag may be omitted if the html element
  86. # is not immediately followed by a space character or a comment.
  87. return type not in ("Comment", "SpaceCharacters")
  88. elif tagname in ('li', 'optgroup', 'tr'):
  89. # A li element's end tag may be omitted if the li element is
  90. # immediately followed by another li element or if there is
  91. # no more content in the parent element.
  92. # An optgroup element's end tag may be omitted if the optgroup
  93. # element is immediately followed by another optgroup element,
  94. # or if there is no more content in the parent element.
  95. # A tr element's end tag may be omitted if the tr element is
  96. # immediately followed by another tr element, or if there is
  97. # no more content in the parent element.
  98. if type == "StartTag":
  99. return next["name"] == tagname
  100. else:
  101. return type == "EndTag" or type is None
  102. elif tagname in ('dt', 'dd'):
  103. # A dt element's end tag may be omitted if the dt element is
  104. # immediately followed by another dt element or a dd element.
  105. # A dd element's end tag may be omitted if the dd element is
  106. # immediately followed by another dd element or a dt element,
  107. # or if there is no more content in the parent element.
  108. if type == "StartTag":
  109. return next["name"] in ('dt', 'dd')
  110. elif tagname == 'dd':
  111. return type == "EndTag" or type is None
  112. else:
  113. return False
  114. elif tagname == 'p':
  115. # A p element's end tag may be omitted if the p element is
  116. # immediately followed by an address, article, aside,
  117. # blockquote, datagrid, dialog, dir, div, dl, fieldset,
  118. # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
  119. # nav, ol, p, pre, section, table, or ul, element, or if
  120. # there is no more content in the parent element.
  121. if type in ("StartTag", "EmptyTag"):
  122. return next["name"] in ('address', 'article', 'aside',
  123. 'blockquote', 'datagrid', 'dialog',
  124. 'dir', 'div', 'dl', 'fieldset', 'footer',
  125. 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
  126. 'header', 'hr', 'menu', 'nav', 'ol',
  127. 'p', 'pre', 'section', 'table', 'ul')
  128. else:
  129. return type == "EndTag" or type is None
  130. elif tagname == 'option':
  131. # An option element's end tag may be omitted if the option
  132. # element is immediately followed by another option element,
  133. # or if it is immediately followed by an <code>optgroup</code>
  134. # element, or if there is no more content in the parent
  135. # element.
  136. if type == "StartTag":
  137. return next["name"] in ('option', 'optgroup')
  138. else:
  139. return type == "EndTag" or type is None
  140. elif tagname in ('rt', 'rp'):
  141. # An rt element's end tag may be omitted if the rt element is
  142. # immediately followed by an rt or rp element, or if there is
  143. # no more content in the parent element.
  144. # An rp element's end tag may be omitted if the rp element is
  145. # immediately followed by an rt or rp element, or if there is
  146. # no more content in the parent element.
  147. if type == "StartTag":
  148. return next["name"] in ('rt', 'rp')
  149. else:
  150. return type == "EndTag" or type is None
  151. elif tagname == 'colgroup':
  152. # A colgroup element's end tag may be omitted if the colgroup
  153. # element is not immediately followed by a space character or
  154. # a comment.
  155. if type in ("Comment", "SpaceCharacters"):
  156. return False
  157. elif type == "StartTag":
  158. # XXX: we also look for an immediately following colgroup
  159. # element. See is_optional_start.
  160. return next["name"] != 'colgroup'
  161. else:
  162. return True
  163. elif tagname in ('thead', 'tbody'):
  164. # A thead element's end tag may be omitted if the thead element
  165. # is immediately followed by a tbody or tfoot element.
  166. # A tbody element's end tag may be omitted if the tbody element
  167. # is immediately followed by a tbody or tfoot element, or if
  168. # there is no more content in the parent element.
  169. # A tfoot element's end tag may be omitted if the tfoot element
  170. # is immediately followed by a tbody element, or if there is no
  171. # more content in the parent element.
  172. # XXX: we never omit the end tag when the following element is
  173. # a tbody. See is_optional_start.
  174. if type == "StartTag":
  175. return next["name"] in ['tbody', 'tfoot']
  176. elif tagname == 'tbody':
  177. return type == "EndTag" or type is None
  178. else:
  179. return False
  180. elif tagname == 'tfoot':
  181. # A tfoot element's end tag may be omitted if the tfoot element
  182. # is immediately followed by a tbody element, or if there is no
  183. # more content in the parent element.
  184. # XXX: we never omit the end tag when the following element is
  185. # a tbody. See is_optional_start.
  186. if type == "StartTag":
  187. return next["name"] == 'tbody'
  188. else:
  189. return type == "EndTag" or type is None
  190. elif tagname in ('td', 'th'):
  191. # A td element's end tag may be omitted if the td element is
  192. # immediately followed by a td or th element, or if there is
  193. # no more content in the parent element.
  194. # A th element's end tag may be omitted if the th element is
  195. # immediately followed by a td or th element, or if there is
  196. # no more content in the parent element.
  197. if type == "StartTag":
  198. return next["name"] in ('td', 'th')
  199. else:
  200. return type == "EndTag" or type is None
  201. return False