You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

test_utf8validator.py 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. # coding=utf-8
  2. ###############################################################################
  3. #
  4. # The MIT License (MIT)
  5. #
  6. # Copyright (c) Crossbar.io Technologies GmbH
  7. #
  8. # Permission is hereby granted, free of charge, to any person obtaining a copy
  9. # of this software and associated documentation files (the "Software"), to deal
  10. # in the Software without restriction, including without limitation the rights
  11. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. # copies of the Software, and to permit persons to whom the Software is
  13. # furnished to do so, subject to the following conditions:
  14. #
  15. # The above copyright notice and this permission notice shall be included in
  16. # all copies or substantial portions of the Software.
  17. #
  18. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. # THE SOFTWARE.
  25. #
  26. ###############################################################################
  27. from __future__ import absolute_import
  28. import six
  29. import struct
  30. import unittest
  31. from autobahn.websocket.utf8validator import Utf8Validator as StandardUtf8Validator
  32. try:
  33. from _nvx_utf8validator import lib # noqa
  34. from autobahn.nvx import Utf8Validator as NvxUtf8Validator
  35. except ImportError:
  36. HAS_NVX = False
  37. else:
  38. HAS_NVX = True
  39. def _create_utf8_test_sequences():
  40. """
  41. Create test sequences for UTF-8 decoder tests from
  42. http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  43. """
  44. UTF8_TEST_SEQUENCES = []
  45. # 1 Some correct UTF-8 text
  46. vss = b'\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5'
  47. vs = [b"Some valid UTF-8 sequences", []]
  48. vs[1].append((True, b'hello\x24world')) # U+0024
  49. vs[1].append((True, b'hello\xC2\xA2world')) # U+00A2
  50. vs[1].append((True, b'hello\xE2\x82\xACworld')) # U+20AC
  51. vs[1].append((True, b'hello\xF0\xA4\xAD\xA2world')) # U+24B62
  52. vs[1].append((True, vss))
  53. UTF8_TEST_SEQUENCES.append(vs)
  54. # All prefixes of correct UTF-8 text
  55. vs = [
  56. b"All prefixes of a valid UTF-8 string that contains multi-byte code points",
  57. []]
  58. v = StandardUtf8Validator()
  59. for i in range(1, len(vss) + 1):
  60. v.reset()
  61. res = v.validate(vss[:i])
  62. vs[1].append((res[0] and res[1], vss[:i]))
  63. UTF8_TEST_SEQUENCES.append(vs)
  64. # 2.1 First possible sequence of a certain length
  65. vs = [b"First possible sequence of a certain length", []]
  66. vs[1].append((True, b'\x00'))
  67. vs[1].append((True, b'\xc2\x80'))
  68. vs[1].append((True, b'\xe0\xa0\x80'))
  69. vs[1].append((True, b'\xf0\x90\x80\x80'))
  70. UTF8_TEST_SEQUENCES.append(vs)
  71. # the following conform to the UTF-8 integer encoding scheme, but
  72. # valid UTF-8 only allows for Unicode code points up to U+10FFFF
  73. vs = [b"First possible sequence length 5/6 (invalid codepoints)", []]
  74. vs[1].append((False, b'\xf8\x88\x80\x80\x80'))
  75. vs[1].append((False, b'\xfc\x84\x80\x80\x80\x80'))
  76. UTF8_TEST_SEQUENCES.append(vs)
  77. # 2.2 Last possible sequence of a certain length
  78. vs = [b"Last possible sequence of a certain length", []]
  79. vs[1].append((True, b'\x7f'))
  80. vs[1].append((True, b'\xdf\xbf'))
  81. vs[1].append((True, b'\xef\xbf\xbf'))
  82. vs[1].append((True, b'\xf4\x8f\xbf\xbf'))
  83. UTF8_TEST_SEQUENCES.append(vs)
  84. # the following conform to the UTF-8 integer encoding scheme, but
  85. # valid UTF-8 only allows for Unicode code points up to U+10FFFF
  86. vs = [b"Last possible sequence length 4/5/6 (invalid codepoints)", []]
  87. vs[1].append((False, b'\xf7\xbf\xbf\xbf'))
  88. vs[1].append((False, b'\xfb\xbf\xbf\xbf\xbf'))
  89. vs[1].append((False, b'\xfd\xbf\xbf\xbf\xbf\xbf'))
  90. UTF8_TEST_SEQUENCES.append(vs)
  91. # 2.3 Other boundary conditions
  92. vs = [b"Other boundary conditions", []]
  93. vs[1].append((True, b'\xed\x9f\xbf'))
  94. vs[1].append((True, b'\xee\x80\x80'))
  95. vs[1].append((True, b'\xef\xbf\xbd'))
  96. vs[1].append((True, b'\xf4\x8f\xbf\xbf'))
  97. vs[1].append((False, b'\xf4\x90\x80\x80'))
  98. UTF8_TEST_SEQUENCES.append(vs)
  99. # 3.1 Unexpected continuation bytes
  100. vs = [b"Unexpected continuation bytes", []]
  101. vs[1].append((False, b'\x80'))
  102. vs[1].append((False, b'\xbf'))
  103. vs[1].append((False, b'\x80\xbf'))
  104. vs[1].append((False, b'\x80\xbf\x80'))
  105. vs[1].append((False, b'\x80\xbf\x80\xbf'))
  106. vs[1].append((False, b'\x80\xbf\x80\xbf\x80'))
  107. vs[1].append((False, b'\x80\xbf\x80\xbf\x80\xbf'))
  108. s = b''
  109. # 3.2 Lonely start characters
  110. vs = [b"Lonely start characters", []]
  111. m = [(0xc0, 0xdf), (0xe0, 0xef), (0xf0, 0xf7), (0xf8, 0xfb), (0xfc, 0xfd)]
  112. for mm in m:
  113. s = b''
  114. for i in range(mm[0], mm[1]):
  115. s += struct.pack('BB', i, 0x20)
  116. # s += chr(i)
  117. # s += chr(0x20)
  118. vs[1].append((False, s))
  119. UTF8_TEST_SEQUENCES.append(vs)
  120. # 3.3 Sequences with last continuation byte missing
  121. vs = [b"Sequences with last continuation byte missing", []]
  122. k = [b'\xc0', b'\xe0\x80', b'\xf0\x80\x80', b'\xf8\x80\x80\x80', b'\xfc\x80\x80\x80\x80',
  123. b'\xdf', b'\xef\xbf', b'\xf7\xbf\xbf', b'\xfb\xbf\xbf\xbf', b'\xfd\xbf\xbf\xbf\xbf']
  124. for kk in k:
  125. vs[1].append((False, kk))
  126. UTF8_TEST_SEQUENCES.append(vs)
  127. # 3.4 Concatenation of incomplete sequences
  128. vs = [b"Concatenation of incomplete sequences", []]
  129. vs[1].append((False, b''.join(k)))
  130. UTF8_TEST_SEQUENCES.append(vs)
  131. # 3.5 Impossible bytes
  132. vs = [b"Impossible bytes", []]
  133. vs[1].append((False, b'\xfe'))
  134. vs[1].append((False, b'\xff'))
  135. vs[1].append((False, b'\xfe\xfe\xff\xff'))
  136. UTF8_TEST_SEQUENCES.append(vs)
  137. # 4.1 Examples of an overlong ASCII character
  138. vs = [b"Examples of an overlong ASCII character", []]
  139. vs[1].append((False, b'\xc0\xaf'))
  140. vs[1].append((False, b'\xe0\x80\xaf'))
  141. vs[1].append((False, b'\xf0\x80\x80\xaf'))
  142. vs[1].append((False, b'\xf8\x80\x80\x80\xaf'))
  143. vs[1].append((False, b'\xfc\x80\x80\x80\x80\xaf'))
  144. UTF8_TEST_SEQUENCES.append(vs)
  145. # 4.2 Maximum overlong sequences
  146. vs = [b"Maximum overlong sequences", []]
  147. vs[1].append((False, b'\xc1\xbf'))
  148. vs[1].append((False, b'\xe0\x9f\xbf'))
  149. vs[1].append((False, b'\xf0\x8f\xbf\xbf'))
  150. vs[1].append((False, b'\xf8\x87\xbf\xbf\xbf'))
  151. vs[1].append((False, b'\xfc\x83\xbf\xbf\xbf\xbf'))
  152. UTF8_TEST_SEQUENCES.append(vs)
  153. # 4.3 Overlong representation of the NUL character
  154. vs = [b"Overlong representation of the NUL character", []]
  155. vs[1].append((False, b'\xc0\x80'))
  156. vs[1].append((False, b'\xe0\x80\x80'))
  157. vs[1].append((False, b'\xf0\x80\x80\x80'))
  158. vs[1].append((False, b'\xf8\x80\x80\x80\x80'))
  159. vs[1].append((False, b'\xfc\x80\x80\x80\x80\x80'))
  160. UTF8_TEST_SEQUENCES.append(vs)
  161. # 5.1 Single UTF-16 surrogates
  162. vs = [b"Single UTF-16 surrogates", []]
  163. vs[1].append((False, b'\xed\xa0\x80'))
  164. vs[1].append((False, b'\xed\xad\xbf'))
  165. vs[1].append((False, b'\xed\xae\x80'))
  166. vs[1].append((False, b'\xed\xaf\xbf'))
  167. vs[1].append((False, b'\xed\xb0\x80'))
  168. vs[1].append((False, b'\xed\xbe\x80'))
  169. vs[1].append((False, b'\xed\xbf\xbf'))
  170. UTF8_TEST_SEQUENCES.append(vs)
  171. # 5.2 Paired UTF-16 surrogates
  172. vs = [b"Paired UTF-16 surrogates", []]
  173. vs[1].append((False, b'\xed\xa0\x80\xed\xb0\x80'))
  174. vs[1].append((False, b'\xed\xa0\x80\xed\xbf\xbf'))
  175. vs[1].append((False, b'\xed\xad\xbf\xed\xb0\x80'))
  176. vs[1].append((False, b'\xed\xad\xbf\xed\xbf\xbf'))
  177. vs[1].append((False, b'\xed\xae\x80\xed\xb0\x80'))
  178. vs[1].append((False, b'\xed\xae\x80\xed\xbf\xbf'))
  179. vs[1].append((False, b'\xed\xaf\xbf\xed\xb0\x80'))
  180. vs[1].append((False, b'\xed\xaf\xbf\xed\xbf\xbf'))
  181. UTF8_TEST_SEQUENCES.append(vs)
  182. # 5.3 Other illegal code positions
  183. # Those are non-character code points and valid UTF-8 by RFC 3629
  184. vs = [b"Non-character code points (valid UTF-8)", []]
  185. # https://bug686312.bugzilla.mozilla.org/attachment.cgi?id=561257
  186. # non-characters: EF BF [BE-BF]
  187. vs[1].append((True, b'\xef\xbf\xbe'))
  188. vs[1].append((True, b'\xef\xbf\xbf'))
  189. # non-characters: F[0-7] [89AB]F BF [BE-BF]
  190. for z1 in [b'\xf0', b'\xf1', b'\xf2', b'\xf3', b'\xf4']:
  191. for z2 in [b'\x8f', b'\x9f', b'\xaf', b'\xbf']:
  192. # those encode codepoints >U+10FFFF
  193. if not (z1 == b'\xf4' and z2 != b'\x8f'):
  194. for z3 in [b'\xbe', b'\xbf']:
  195. zz = z1 + z2 + b'\xbf' + z3
  196. if zz not in [b'\xf0\x8f\xbf\xbe',
  197. b'\xf0\x8f\xbf\xbf']: # filter overlong sequences
  198. vs[1].append((True, zz))
  199. UTF8_TEST_SEQUENCES.append(vs)
  200. # Unicode "specials", such as replacement char etc
  201. # http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29
  202. vs = [b"Unicode specials (i.e. replacement char)", []]
  203. vs[1].append((True, b'\xef\xbf\xb9'))
  204. vs[1].append((True, b'\xef\xbf\xba'))
  205. vs[1].append((True, b'\xef\xbf\xbb'))
  206. vs[1].append((True, b'\xef\xbf\xbc'))
  207. vs[1].append((True, b'\xef\xbf\xbd')) # replacement char
  208. vs[1].append((True, b'\xef\xbf\xbe'))
  209. vs[1].append((True, b'\xef\xbf\xbf'))
  210. UTF8_TEST_SEQUENCES.append(vs)
  211. return UTF8_TEST_SEQUENCES
  212. def _create_valid_utf8_test_sequences():
  213. """
  214. Generate some exotic, but valid UTF8 test strings.
  215. """
  216. VALID_UTF8_TEST_SEQUENCES = []
  217. for test in _create_utf8_test_sequences():
  218. valids = [x[1] for x in test[1] if x[0]]
  219. if len(valids) > 0:
  220. VALID_UTF8_TEST_SEQUENCES.append([test[0], valids])
  221. return VALID_UTF8_TEST_SEQUENCES
  222. @unittest.skipIf(not HAS_NVX, 'NVX native extensions not present')
  223. class TestNvxUtf8Validator(unittest.TestCase):
  224. def setUp(self):
  225. # These tests verify the UTF-8 decoder/validator on the various test cases from
  226. # http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
  227. vs = []
  228. for k in _create_utf8_test_sequences():
  229. vs.extend(k[1])
  230. # All Unicode code points
  231. for i in range(
  232. 0, 0xffff): # should by 0x10ffff, but non-wide Python build is limited to 16-bits
  233. if i < 0xD800 or i > 0xDFFF: # filter surrogate code points, which are disallowed to encode in UTF-8
  234. vs.append((True, six.unichr(i).encode("utf-8")))
  235. # FIXME: UnicodeEncodeError: 'utf-8' codec can't encode character '\ud800'
  236. # in position 0: surrogates not allowed
  237. if False:
  238. # 5.1 Single UTF-16 surrogates
  239. for i in range(0xD800, 0xDBFF): # high-surrogate
  240. ss = six.unichr(i).encode("utf-8")
  241. vs.append((False, ss))
  242. for i in range(0xDC00, 0xDFFF): # low-surrogate
  243. ss = six.unichr(i).encode("utf-8")
  244. vs.append((False, ss))
  245. # 5.2 Paired UTF-16 surrogates
  246. for i in range(0xD800, 0xDBFF): # high-surrogate
  247. for j in range(0xDC00, 0xDFFF): # low-surrogate
  248. ss1 = six.unichr(i).encode("utf-8")
  249. ss2 = six.unichr(j).encode("utf-8")
  250. vs.append((False, ss1 + ss2))
  251. vs.append((False, ss2 + ss1))
  252. self._TEST_SEQUENCES = vs
  253. def test_standard_utf8validator(self):
  254. """
  255. Test standard implementation of UTF8 validator.
  256. """
  257. validator = StandardUtf8Validator()
  258. return self._test_utf8(validator)
  259. def test_nvx_utf8validator(self):
  260. """
  261. Test NVX implementation of UTF8 validator.
  262. """
  263. validator = NvxUtf8Validator()
  264. return self._test_utf8(validator)
  265. def test_standard_utf8validator_incremental(self):
  266. """
  267. Test standard implementation of UTF8 validator in incremental mode.
  268. """
  269. validator = StandardUtf8Validator()
  270. return self._test_utf8_incremental(validator)
  271. # NVX UTF8 validator lack incremental mode implementation
  272. @unittest.expectedFailure
  273. def test_nvx_utf8validator_incremental(self):
  274. """
  275. Test NVX implementation of UTF8 validator in incremental mode.
  276. """
  277. validator = NvxUtf8Validator()
  278. return self._test_utf8_incremental(validator)
  279. def _test_utf8(self, validator):
  280. for s in self._TEST_SEQUENCES:
  281. validator.reset()
  282. r = validator.validate(s[1])
  283. # no UTF-8 decode error _and_ everything consumed
  284. res = r[0] and r[1]
  285. self.assertEqual(res, s[0])
  286. def _test_utf8_incremental(self, validator, withPositions=True):
  287. # These tests verify that the UTF-8 decoder/validator can operate incrementally.
  288. if withPositions:
  289. # testing validator 4 on incremental detection with positions
  290. k = 4
  291. else:
  292. # testing validator 2 on incremental detection without positions
  293. k = 2
  294. validator.reset()
  295. self.assertEqual((True, True, 15, 15)[:k], validator.validate(u'µ@ßöäüàá'.encode('utf8'))[:k])
  296. validator.reset()
  297. self.assertEqual((False, False, 0, 0)[:k], validator.validate(b"\xF5")[:k])
  298. # the following 3 all fail on eating byte 7 (0xA0)
  299. validator.reset()
  300. self.assertEqual((True, True, 6, 6)[:k], validator.validate(b"\x65\x64\x69\x74\x65\x64")[:k])
  301. self.assertEqual((False, False, 1, 7)[:k], validator.validate(b"\xED\xA0\x80")[:k])
  302. validator.reset()
  303. self.assertEqual((True, True, 4, 4)[:k], validator.validate(b"\x65\x64\x69\x74")[:k])
  304. self.assertEqual((False, False, 3, 7)[:k], validator.validate(b"\x65\x64\xED\xA0\x80")[:k])
  305. validator.reset()
  306. self.assertEqual((True, False, 7, 7)[:k], validator.validate(b"\x65\x64\x69\x74\x65\x64\xED")[:k])
  307. self.assertEqual((False, False, 0, 7)[:k], validator.validate(b"\xA0\x80")[:k])