Funktionierender Prototyp des Serious Games zur Vermittlung von Wissen zu Software-Engineering-Arbeitsmodellen.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

utf8validator.py 6.5KB

1 year ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. ###############################################################################
  2. #
  3. # The MIT License (MIT)
  4. #
  5. # Copyright (c) typedef int GmbH
  6. #
  7. # Permission is hereby granted, free of charge, to any person obtaining a copy
  8. # of this software and associated documentation files (the "Software"), to deal
  9. # in the Software without restriction, including without limitation the rights
  10. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  11. # copies of the Software, and to permit persons to whom the Software is
  12. # furnished to do so, subject to the following conditions:
  13. #
  14. # The above copyright notice and this permission notice shall be included in
  15. # all copies or substantial portions of the Software.
  16. #
  17. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  18. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  19. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  20. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  21. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  22. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  23. # THE SOFTWARE.
  24. #
  25. ###############################################################################
  26. # Note: This code is a Python implementation of the algorithm
  27. # "Flexible and Economical UTF-8 Decoder" by Bjoern Hoehrmann
  28. # bjoern@hoehrmann.de, http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  29. __all__ = ("Utf8Validator",)
  30. # DFA transitions
  31. UTF8VALIDATOR_DFA = (
  32. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 00..1f
  33. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 20..3f
  34. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 40..5f
  35. 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, # 60..7f
  36. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, # 80..9f
  37. 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, # a0..bf
  38. 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, # c0..df
  39. 0xa, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x4, 0x3, 0x3, # e0..ef
  40. 0xb, 0x6, 0x6, 0x6, 0x5, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, 0x8, # f0..ff
  41. 0x0, 0x1, 0x2, 0x3, 0x5, 0x8, 0x7, 0x1, 0x1, 0x1, 0x4, 0x6, 0x1, 0x1, 0x1, 0x1, # s0..s0
  42. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, # s1..s2
  43. 1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, # s3..s4
  44. 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, # s5..s6
  45. 1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, # s7..s8
  46. )
  47. UTF8_ACCEPT = 0
  48. UTF8_REJECT = 1
  49. # use Cython implementation of UTF8 validator if available
  50. try:
  51. from wsaccel.utf8validator import Utf8Validator
  52. except ImportError:
  53. # Fallback to pure Python implementation - also for PyPy.
  54. #
  55. # Do NOT touch this code unless you know what you are doing!
  56. # https://github.com/oberstet/scratchbox/tree/master/python/utf8
  57. # Python 3 and above
  58. # convert DFA table to bytes (performance)
  59. UTF8VALIDATOR_DFA_S = bytes(UTF8VALIDATOR_DFA)
  60. class Utf8Validator(object):
  61. """
  62. Incremental UTF-8 validator with constant memory consumption (minimal state).
  63. Implements the algorithm "Flexible and Economical UTF-8 Decoder" by
  64. Bjoern Hoehrmann (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
  65. """
  66. __slots__ = (
  67. '_codepoint',
  68. '_state',
  69. '_index',
  70. )
  71. def __init__(self):
  72. self._codepoint = None
  73. self._state = None
  74. self._index = None
  75. self.reset()
  76. def decode(self, b):
  77. """
  78. Eat one UTF-8 octet, and validate on the fly.
  79. Returns ``UTF8_ACCEPT`` when enough octets have been consumed, in which case
  80. ``self.codepoint`` contains the decoded Unicode code point.
  81. Returns ``UTF8_REJECT`` when invalid UTF-8 was encountered.
  82. Returns some other positive integer when more octets need to be eaten.
  83. """
  84. tt = UTF8VALIDATOR_DFA_S[b]
  85. if self._state != UTF8_ACCEPT:
  86. self._codepoint = (b & 0x3f) | (self._codepoint << 6)
  87. else:
  88. self._codepoint = (0xff >> tt) & b
  89. self._state = UTF8VALIDATOR_DFA_S[256 + self._state * 16 + tt]
  90. return self._state
  91. def reset(self):
  92. """
  93. Reset validator to start new incremental UTF-8 decode/validation.
  94. """
  95. self._state = UTF8_ACCEPT # the empty string is valid UTF8
  96. self._codepoint = 0
  97. self._index = 0
  98. def validate(self, ba):
  99. """
  100. Incrementally validate a chunk of bytes provided as string.
  101. Will return a quad ``(valid?, endsOnCodePoint?, currentIndex, totalIndex)``.
  102. As soon as an octet is encountered which renders the octet sequence
  103. invalid, a quad with ``valid? == False`` is returned. ``currentIndex`` returns
  104. the index within the currently consumed chunk, and ``totalIndex`` the
  105. index within the total consumed sequence that was the point of bail out.
  106. When ``valid? == True``, currentIndex will be ``len(ba)`` and ``totalIndex`` the
  107. total amount of consumed bytes.
  108. """
  109. #
  110. # The code here is written for optimal JITting in PyPy, not for best
  111. # readability by your grandma or particular elegance. Do NOT touch!
  112. #
  113. l = len(ba)
  114. i = 0
  115. state = self._state
  116. while i < l:
  117. # optimized version of decode(), since we are not interested in actual code points
  118. state = UTF8VALIDATOR_DFA_S[256 + (state << 4) + UTF8VALIDATOR_DFA_S[ba[i]]]
  119. if state == UTF8_REJECT:
  120. self._state = state
  121. self._index += i
  122. return False, False, i, self._index
  123. i += 1
  124. self._state = state
  125. self._index += l
  126. return True, state == UTF8_ACCEPT, l, self._index