You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

sanitizer.py 26KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896
  1. from __future__ import absolute_import, division, unicode_literals
  2. import re
  3. from xml.sax.saxutils import escape, unescape
  4. from pip._vendor.six.moves import urllib_parse as urlparse
  5. from . import base
  6. from ..constants import namespaces, prefixes
  __all__ = ["Filter"]

  # Default element whitelist used by ``Filter``: a frozenset of
  # ``(namespace URI, local name)`` pairs.  Tags are listed per namespace
  # prefix and expanded into pairs through ``namespaces``.
  allowed_elements = frozenset(
      (namespaces[prefix], tag)
      for prefix, tags in (
          ('html', (
              'a', 'abbr', 'acronym', 'address', 'area', 'article', 'aside',
              'audio', 'b', 'big', 'blockquote', 'br', 'button', 'canvas',
              'caption', 'center', 'cite', 'code', 'col', 'colgroup',
              'command', 'datagrid', 'datalist', 'dd', 'del', 'details',
              'dfn', 'dialog', 'dir', 'div', 'dl', 'dt', 'em', 'event-source',
              'fieldset', 'figcaption', 'figure', 'footer', 'font', 'form',
              'header', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'i', 'img',
              'input', 'ins', 'keygen', 'kbd', 'label', 'legend', 'li', 'm',
              'map', 'menu', 'meter', 'multicol', 'nav', 'nextid', 'ol',
              'output', 'optgroup', 'option', 'p', 'pre', 'progress', 'q',
              's', 'samp', 'section', 'select', 'small', 'sound', 'source',
              'spacer', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
              'tbody', 'td', 'textarea', 'time', 'tfoot', 'th', 'thead', 'tr',
              'tt', 'u', 'ul', 'var', 'video',
          )),
          ('mathml', (
              'maction', 'math', 'merror', 'mfrac', 'mi', 'mmultiscripts',
              'mn', 'mo', 'mover', 'mpadded', 'mphantom', 'mprescripts',
              'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub', 'msubsup',
              'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover',
              'none',
          )),
          ('svg', (
              'a', 'animate', 'animateColor', 'animateMotion',
              'animateTransform', 'clipPath', 'circle', 'defs', 'desc',
              'ellipse', 'font-face', 'font-face-name', 'font-face-src', 'g',
              'glyph', 'hkern', 'linearGradient', 'line', 'marker',
              'metadata', 'missing-glyph', 'mpath', 'path', 'polygon',
              'polyline', 'radialGradient', 'rect', 'set', 'stop', 'svg',
              'switch', 'text', 'title', 'tspan', 'use',
          )),
      )
      for tag in tags
  )
  # Default attribute whitelist used by ``Filter``: a frozenset of
  # ``(namespace URI or None, local name)`` pairs.  A prefix of ``None`` means
  # the attribute has no namespace; other prefixes are resolved through
  # ``namespaces``.  Names that occur in more than one group collapse to a
  # single entry because the result is a set.
  allowed_attributes = frozenset(
      (None if prefix is None else namespaces[prefix], attr)
      for prefix, attrs in (
          # HTML attributes
          (None, (
              'abbr', 'accept', 'accept-charset', 'accesskey', 'action',
              'align', 'alt', 'autocomplete', 'autofocus', 'axis',
              'background', 'balance', 'bgcolor', 'bgproperties', 'border',
              'bordercolor', 'bordercolordark', 'bordercolorlight',
              'bottompadding', 'cellpadding', 'cellspacing', 'ch',
              'challenge', 'char', 'charoff', 'choff', 'charset', 'checked',
              'cite', 'class', 'clear', 'color', 'cols', 'colspan', 'compact',
              'contenteditable', 'controls', 'coords', 'data', 'datafld',
              'datapagesize', 'datasrc', 'datetime', 'default', 'delay',
              'dir', 'disabled', 'draggable', 'dynsrc', 'enctype', 'end',
              'face', 'for', 'form', 'frame', 'galleryimg', 'gutter',
              'headers', 'height', 'hidefocus', 'hidden', 'high', 'href',
              'hreflang', 'hspace', 'icon', 'id', 'inputmode', 'ismap',
              'keytype', 'label', 'leftspacing', 'lang', 'list', 'longdesc',
              'loop', 'loopcount', 'loopend', 'loopstart', 'low', 'lowsrc',
              'max', 'maxlength', 'media', 'method', 'min', 'multiple',
              'name', 'nohref', 'noshade', 'nowrap', 'open', 'optimum',
              'pattern', 'ping', 'point-size', 'poster', 'pqg', 'preload',
              'prompt', 'radiogroup', 'readonly', 'rel', 'repeat-max',
              'repeat-min', 'replace', 'required', 'rev', 'rightspacing',
              'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape',
              'size', 'span', 'src', 'start', 'step', 'style', 'summary',
              'suppress', 'tabindex', 'target', 'template', 'title',
              'toppadding', 'type', 'unselectable', 'usemap', 'urn', 'valign',
              'value', 'variable', 'volume', 'vspace', 'vrml', 'width',
              'wrap',
          )),
          ('xml', ('lang',)),
          # MathML attributes
          (None, (
              'actiontype', 'align', 'columnalign', 'columnlines',
              'columnspacing', 'columnspan', 'depth', 'display',
              'displaystyle', 'equalcolumns', 'equalrows', 'fence',
              'fontstyle', 'fontweight', 'frame', 'height', 'linethickness',
              'lspace', 'mathbackground', 'mathcolor', 'mathvariant',
              'maxsize', 'minsize', 'other', 'rowalign', 'rowlines',
              'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
              'separator', 'stretchy', 'width',
          )),
          ('xlink', ('href', 'show', 'type')),
          # SVG attributes
          (None, (
              'accent-height', 'accumulate', 'additive', 'alphabetic',
              'arabic-form', 'ascent', 'attributeName', 'attributeType',
              'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
              'class', 'clip-path', 'color', 'color-rendering', 'content',
              'cx', 'cy', 'd', 'dx', 'dy', 'descent', 'display', 'dur', 'end',
              'fill', 'fill-opacity', 'fill-rule', 'font-family', 'font-size',
              'font-stretch', 'font-style', 'font-variant', 'font-weight',
              'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'gradientUnits',
              'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x', 'id',
              'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
              'lang', 'marker-end', 'marker-mid', 'marker-start',
              'markerHeight', 'markerUnits', 'markerWidth', 'mathematical',
              'max', 'min', 'name', 'offset', 'opacity', 'orient', 'origin',
              'overline-position', 'overline-thickness', 'panose-1', 'path',
              'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
              'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
              'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
              'stemh', 'stemv', 'stop-color', 'stop-opacity',
              'strikethrough-position', 'strikethrough-thickness', 'stroke',
              'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
              'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
              'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
              'transform', 'type', 'u1', 'u2', 'underline-position',
              'underline-thickness', 'unicode', 'unicode-range',
              'units-per-em', 'values', 'version', 'viewBox', 'visibility',
              'width', 'widths', 'x', 'x-height', 'x1', 'x2', 'y', 'y1', 'y2',
              'zoomAndPan',
          )),
          ('xlink', (
              'actuate', 'arcrole', 'href', 'role', 'show', 'title', 'type',
          )),
          ('xml', ('base', 'lang', 'space')),
      )
      for attr in attrs
  )
  # Attributes whose values are treated as URIs by ``Filter.allowed_token``;
  # their scheme is checked against the filter's allowed protocols.
  attr_val_is_uri = frozenset(
      [(None, attr) for attr in (
          'href', 'src', 'cite', 'action', 'longdesc', 'poster',
          'background', 'datasrc', 'dynsrc', 'lowsrc', 'ping',
      )] + [
          (namespaces['xlink'], 'href'),
          (namespaces['xml'], 'base'),
      ]
  )
  # Un-namespaced SVG attributes whose values may contain ``url(...)``
  # references; ``Filter.allowed_token`` rewrites these values.
  svg_attr_val_allows_ref = frozenset(
      (None, attr)
      for attr in (
          'clip-path', 'color-profile', 'cursor', 'fill', 'filter', 'marker',
          'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke',
      )
  )
  # SVG element names, stored as ``(None, name)`` pairs, that
  # ``Filter.allowed_token`` consults when deciding whether to drop an
  # ``xlink:href`` attribute.
  svg_allow_local_href = frozenset(
      (None, element)
      for element in (
          'altGlyph', 'animate', 'animateColor', 'animateMotion',
          'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient',
          'pattern', 'radialGradient', 'textpath', 'tref', 'set', 'use',
      )
  )
  # CSS property names that ``Filter.sanitize_css`` passes through unchanged.
  allowed_css_properties = frozenset("""
      azimuth background-color border-bottom-color border-collapse
      border-color border-left-color border-right-color border-top-color
      clear color cursor direction display elevation float font font-family
      font-size font-style font-variant font-weight height letter-spacing
      line-height overflow pause pause-after pause-before pitch pitch-range
      richness speak speak-header speak-numeral speak-punctuation speech-rate
      stress text-align text-decoration text-indent unicode-bidi
      vertical-align voice-family volume white-space width
  """.split())
  # Value keywords that ``Filter.sanitize_css`` accepts for the
  # background/border/margin/padding property families.
  allowed_css_keywords = frozenset("""
      auto aqua black block blue bold both bottom brown center collapse
      dashed dotted fuchsia gray green !important italic left lime maroon
      medium none navy normal nowrap olive pointer purple red right solid
      silver teal top transparent underline white yellow
  """.split())
  # SVG presentation properties that ``Filter.sanitize_css`` lets through
  # inside a ``style`` attribute.
  allowed_svg_properties = frozenset([
      'fill', 'fill-opacity', 'fill-rule',
      'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
      'stroke-opacity',
  ])
  # URI schemes accepted in attributes that hold URIs; values using any other
  # scheme are removed by ``Filter.allowed_token``.
  allowed_protocols = frozenset([
      'ed2k', 'ftp', 'http', 'https', 'irc', 'mailto', 'news', 'gopher',
      'nntp', 'telnet', 'webcal', 'xmpp', 'callto', 'feed', 'urn', 'aim',
      'rsync', 'tag', 'ssh', 'sftp', 'rtsp', 'afs', 'data',
  ])
  # Media types accepted for ``data:`` URIs by ``Filter.allowed_token``.
  allowed_content_types = frozenset([
      'image/png', 'image/jpeg', 'image/gif', 'image/webp', 'image/bmp',
      'text/plain',
  ])
  # Pattern applied to the path component of a ``data:`` URI.  The named group
  # ``content_type`` captures the media type; an optional charset and/or
  # ``base64`` marker may follow before the comma that introduces the payload.
  data_content_type = re.compile(
      r'''
      ^
      # Match a content type <application>/<type>
      (?P<content_type>[-a-zA-Z0-9.]+/[-a-zA-Z0-9.]+)
      # Match any character set and encoding
      (?:(?:;charset=(?:[-a-zA-Z0-9]+)(?:;(?:base64))?)
      |(?:;(?:base64))?(?:;charset=(?:[-a-zA-Z0-9]+))?)
      # Assume the rest is data
      ,.*
      $
      ''',
      flags=re.VERBOSE,
  )
  class Filter(base.Filter):
      """Sanitizes token stream of XHTML+MathML+SVG and of inline style attributes"""

      def __init__(self,
                   source,
                   allowed_elements=allowed_elements,
                   allowed_attributes=allowed_attributes,
                   allowed_css_properties=allowed_css_properties,
                   allowed_css_keywords=allowed_css_keywords,
                   allowed_svg_properties=allowed_svg_properties,
                   allowed_protocols=allowed_protocols,
                   allowed_content_types=allowed_content_types,
                   attr_val_is_uri=attr_val_is_uri,
                   svg_attr_val_allows_ref=svg_attr_val_allows_ref,
                   svg_allow_local_href=svg_allow_local_href):
          """Creates a Filter

          :arg source: the token stream to sanitize (handed to the base filter)

          :arg allowed_elements: set of elements to allow--everything else will
              be escaped

          :arg allowed_attributes: set of attributes to allow in
              elements--everything else will be stripped

          :arg allowed_css_properties: set of CSS properties to allow--everything
              else will be stripped

          :arg allowed_css_keywords: set of CSS keywords to allow--everything
              else will be stripped

          :arg allowed_svg_properties: set of SVG properties to allow--everything
              else will be removed

          :arg allowed_protocols: set of allowed protocols for URIs

          :arg allowed_content_types: set of allowed content types for ``data`` URIs.

          :arg attr_val_is_uri: set of attributes that have URI values--values
              that have a scheme not listed in ``allowed_protocols`` are removed

          :arg svg_attr_val_allows_ref: set of SVG attributes that can have
              references

          :arg svg_allow_local_href: set of SVG elements that may only carry
              local (``#fragment``) ``xlink:href`` values--any other
              ``xlink:href`` on these elements is removed.  Entries may be
              ``(None, name)`` pairs (the default) or bare element names.

          """
          super(Filter, self).__init__(source)
          self.allowed_elements = allowed_elements
          self.allowed_attributes = allowed_attributes
          self.allowed_css_properties = allowed_css_properties
          self.allowed_css_keywords = allowed_css_keywords
          self.allowed_svg_properties = allowed_svg_properties
          self.allowed_protocols = allowed_protocols
          self.allowed_content_types = allowed_content_types
          self.attr_val_is_uri = attr_val_is_uri
          self.svg_attr_val_allows_ref = svg_attr_val_allows_ref
          self.svg_allow_local_href = svg_allow_local_href

      def __iter__(self):
          """Yield each sanitized token; tokens sanitized to a falsy value are skipped."""
          for token in base.Filter.__iter__(self):
              token = self.sanitize_token(token)
              if token:
                  yield token

      # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
      # stripping out all attributes not in ALLOWED_ATTRIBUTES. Style attributes
      # are parsed, and a restricted set, specified by ALLOWED_CSS_PROPERTIES and
      # ALLOWED_CSS_KEYWORDS, are allowed through. attributes in ATTR_VAL_IS_URI
      # are scanned, and only URI schemes specified in ALLOWED_PROTOCOLS are
      # allowed.
      #
      #   sanitize_html('<script> do_nasty_stuff() </script>')
      #    => &lt;script> do_nasty_stuff() &lt;/script>
      #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
      #    => <a>Click here for $100</a>
      def sanitize_token(self, token):
          """Sanitize one token.

          Tag tokens are passed to ``allowed_token`` or ``disallowed_token``
          depending on whether the element is whitelisted.  Comment tokens
          yield ``None``.  Every other token is returned unchanged.
          """
          # accommodate filters which use token_type differently
          token_type = token["type"]
          if token_type in ("StartTag", "EndTag", "EmptyTag"):
              name = token["name"]
              namespace = token["namespace"]
              # A tag without a namespace is looked up as an HTML element.
              if ((namespace, name) in self.allowed_elements or
                      (namespace is None and
                       (namespaces["html"], name) in self.allowed_elements)):
                  return self.allowed_token(token)
              return self.disallowed_token(token)
          if token_type == "Comment":
              return None
          return token

      def _uri_value_is_allowed(self, value):
          """Return True if the URI attribute value ``value`` may be kept."""
          # I don't have a clue where this regexp comes from or why it matches those
          # characters, nor why we call unescape. I just know it's always been here.
          # Should you be worried by this comment in a sanitizer? Yes. On the other hand, all
          # this will do is remove *more* than it otherwise would.
          val_unescaped = re.sub("[`\x00-\x20\x7f-\xa0\\s]+", '',
                                 unescape(value)).lower()
          # remove replacement characters from unescaped characters
          val_unescaped = val_unescaped.replace("\ufffd", "")
          try:
              uri = urlparse.urlparse(val_unescaped)
          except ValueError:
              # Unparseable values are rejected.
              return False
          if not uri.scheme:
              # Scheme-less (relative) references are kept.
              return True
          if uri.scheme not in self.allowed_protocols:
              return False
          if uri.scheme == 'data':
              m = data_content_type.match(uri.path)
              return bool(m) and m.group('content_type') in self.allowed_content_types
          return True

      def allowed_token(self, token):
          """Strip unsafe attributes from a whitelisted tag token, in place.

          Removes attributes that are not whitelisted, URI attributes whose
          value is rejected, ``url(...)`` references to non-local targets in
          SVG attributes, and non-local ``xlink:href`` values on the elements
          in ``svg_allow_local_href``.  The ``style`` attribute is rewritten
          through ``sanitize_css``.  Returns the token.
          """
          if "data" in token:
              attrs = token["data"]

              # Remove forbidden attributes
              for forbidden in set(attrs) - self.allowed_attributes:
                  del attrs[forbidden]

              # Remove attributes with disallowed URL values.  Each attribute
              # is deleted at most once: the previous code could delete a
              # ``data:`` URI attribute twice (raising KeyError) when ``data``
              # was not an allowed protocol and the content type was rejected.
              for attr in set(attrs) & self.attr_val_is_uri:
                  if not self._uri_value_is_allowed(attrs[attr]):
                      del attrs[attr]

              for attr in self.svg_attr_val_allows_ref:
                  if attr in attrs:
                      attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
                                           ' ',
                                           unescape(attrs[attr]))

              # ``svg_allow_local_href`` holds ``(None, name)`` pairs by
              # default, so the bare element name must be wrapped before the
              # lookup; bare names are also accepted for custom sets.
              name = token["name"]
              xlink_href = (namespaces['xlink'], 'href')
              if (xlink_href in attrs and
                      ((None, name) in self.svg_allow_local_href or
                       name in self.svg_allow_local_href) and
                      re.search(r'^\s*[^#\s].*', attrs[xlink_href])):
                  del attrs[xlink_href]

              if (None, 'style') in attrs:
                  attrs[(None, 'style')] = self.sanitize_css(attrs[(None, 'style')])
              token["data"] = attrs
          return token

      def disallowed_token(self, token):
          """Turn a non-whitelisted tag token into a Characters token.

          The token's ``data`` becomes the tag's textual form (attribute values
          passed through ``escape``), its ``type`` becomes ``"Characters"`` and
          its ``name`` key is removed.  The token is modified in place and
          returned.
          """
          token_type = token["type"]
          if token_type == "EndTag":
              token["data"] = "</%s>" % token["name"]
          elif token["data"]:
              assert token_type in ("StartTag", "EmptyTag")
              attrs = []
              for (ns, name), v in token["data"].items():
                  qualified = name if ns is None else "%s:%s" % (prefixes[ns], name)
                  attrs.append(' %s="%s"' % (qualified, escape(v)))
              token["data"] = "<%s%s>" % (token["name"], ''.join(attrs))
          else:
              token["data"] = "<%s>" % token["name"]
          if token.get("selfClosing"):
              token["data"] = token["data"][:-1] + "/>"

          token["type"] = "Characters"

          del token["name"]
          return token

      def sanitize_css(self, style):
          """Return ``style`` reduced to whitelisted CSS declarations.

          Returns an empty string when the style text fails either of the two
          overall syntax checks; otherwise returns the accepted
          ``prop: value;`` declarations joined by single spaces.
          """
          # disallow urls
          style = re.sub(r'url\s*\(\s*[^\s)]+?\s*\)\s*', ' ', style)

          # gauntlet
          if not re.match(r"""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style):
              return ''
          if not re.match(r"^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style):
              return ''

          clean = []
          for prop, value in re.findall(r"([-\w]+)\s*:\s*([^:;]*)", style):
              if not value:
                  continue
              if prop.lower() in self.allowed_css_properties:
                  clean.append(prop + ': ' + value + ';')
              elif prop.split('-')[0].lower() in ['background', 'border', 'margin',
                                                  'padding']:
                  # Shorthand families: every whitespace-separated part of the
                  # value must be a whitelisted keyword or match the
                  # colour/length pattern below.
                  for keyword in value.split():
                      if keyword not in self.allowed_css_keywords and \
                              not re.match(r"^(#[0-9a-fA-F]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$", keyword):  # noqa
                          break
                  else:
                      clean.append(prop + ': ' + value + ';')
              elif prop.lower() in self.allowed_svg_properties:
                  clean.append(prop + ': ' + value + ';')

          return ' '.join(clean)