Working prototype of the serious game for teaching knowledge about software engineering work models.

utils.py (11 KB)

import importlib
import logging
import unicodedata
from codecs import IncrementalDecoder
from encodings.aliases import aliases
from functools import lru_cache
from re import findall
from typing import Generator, List, Optional, Set, Tuple, Union

from _multibytecodec import MultibyteIncrementalDecoder

from .constant import (
    ENCODING_MARKS,
    IANA_SUPPORTED_SIMILAR,
    RE_POSSIBLE_ENCODING_INDICATION,
    UNICODE_RANGES_COMBINED,
    UNICODE_SECONDARY_RANGE_KEYWORD,
    UTF8_MAXIMAL_ALLOCATION,
)

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_accentuated(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return (
        "WITH GRAVE" in description
        or "WITH ACUTE" in description
        or "WITH CEDILLA" in description
        or "WITH DIAERESIS" in description
        or "WITH CIRCUMFLEX" in description
        or "WITH TILDE" in description
    )


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def remove_accent(character: str) -> str:
    decomposed: str = unicodedata.decomposition(character)
    if not decomposed:
        return character

    codes: List[str] = decomposed.split(" ")

    return chr(int(codes[0], 16))
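
# Illustrative behaviour of the two helpers above (values follow directly from
# unicodedata; a sketch, not an exhaustive spec):
#
#     >>> is_accentuated("é")   # LATIN SMALL LETTER E WITH ACUTE
#     True
#     >>> remove_accent("é")    # decomposition "0065 0301" -> chr(0x65)
#     'e'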

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def unicode_range(character: str) -> Optional[str]:
    """
    Retrieve the official Unicode range name for a single character.
    """
    character_ord: int = ord(character)

    for range_name, ord_range in UNICODE_RANGES_COMBINED.items():
        if character_ord in ord_range:
            return range_name

    return None
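
# Lookup example, assuming UNICODE_RANGES_COMBINED uses the standard Unicode
# block names:
#
#     >>> unicode_range("a")
#     'Basic Latin'
#     >>> unicode_range("é")
#     'Latin-1 Supplement'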

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_latin(character: str) -> bool:
    try:
        description: str = unicodedata.name(character)
    except ValueError:
        return False
    return "LATIN" in description


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_ascii(character: str) -> bool:
    try:
        character.encode("ascii")
    except UnicodeEncodeError:
        return False
    return True


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_punctuation(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "P" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Punctuation" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_symbol(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    if "S" in character_category or "N" in character_category:
        return True

    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Forms" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_emoticon(character: str) -> bool:
    character_range: Optional[str] = unicode_range(character)

    if character_range is None:
        return False

    return "Emoticons" in character_range


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_separator(character: str) -> bool:
    if character.isspace() or character in {"|", "+", "<", ">"}:
        return True

    character_category: str = unicodedata.category(character)

    return "Z" in character_category or character_category in {"Po", "Pd", "Pc"}

@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_case_variable(character: str) -> bool:
    return character.islower() != character.isupper()


def is_private_use_only(character: str) -> bool:
    character_category: str = unicodedata.category(character)

    return character_category == "Co"


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_cjk(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "CJK" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hiragana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HIRAGANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_katakana(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "KATAKANA" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_hangul(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "HANGUL" in character_name


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_thai(character: str) -> bool:
    try:
        character_name = unicodedata.name(character)
    except ValueError:
        return False

    return "THAI" in character_name
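
# Illustrative checks for the script predicates above (the keywords are matched
# against unicodedata.name, e.g. "CJK UNIFIED IDEOGRAPH-6F22", "HIRAGANA LETTER HI"):
#
#     >>> is_cjk("漢"), is_hiragana("ひ"), is_katakana("カ"), is_hangul("한"), is_thai("ก")
#     (True, True, True, True, True)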

@lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
def is_unicode_range_secondary(range_name: str) -> bool:
    return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)


@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
def is_unprintable(character: str) -> bool:
    return (
        character.isspace() is False  # includes \n \t \r \v
        and character.isprintable() is False
        and character != "\x1A"  # Why? It's the ASCII substitute character.
        and character != "\ufeff"  # bug discovered in Python: the Zero Width No-Break Space,
        # located in Arabic Presentation Forms-B, Unicode 1.1, is not acknowledged as a space.
    )
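
# A quick sanity check (control characters count as unprintable, whitespace does not):
#
#     >>> is_unprintable("\x00")
#     True
#     >>> is_unprintable(" ")
#     False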

def any_specified_encoding(sequence: bytes, search_zone: int = 4096) -> Optional[str]:
    """
    Extract any declared encoding from the first n bytes, using an ASCII-only decoder.
    """
    if not isinstance(sequence, bytes):
        raise TypeError

    seq_len: int = len(sequence)

    results: List[str] = findall(
        RE_POSSIBLE_ENCODING_INDICATION,
        sequence[: min(seq_len, search_zone)].decode("ascii", errors="ignore"),
    )

    if len(results) == 0:
        return None

    for specified_encoding in results:
        specified_encoding = specified_encoding.lower().replace("-", "_")

        encoding_alias: str
        encoding_iana: str

        for encoding_alias, encoding_iana in aliases.items():
            if encoding_alias == specified_encoding:
                return encoding_iana
            if encoding_iana == specified_encoding:
                return encoding_iana

    return None
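
# Example: an embedded charset declaration (as matched by
# RE_POSSIBLE_ENCODING_INDICATION) is normalized to its Python codec name:
#
#     >>> any_specified_encoding(b'<meta charset="utf-8">')
#     'utf_8'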

@lru_cache(maxsize=128)
def is_multi_byte_encoding(name: str) -> bool:
    """
    Verify whether a specific encoding is a multi-byte one, based on its IANA name.
    """
    return name in {
        "utf_8",
        "utf_8_sig",
        "utf_16",
        "utf_16_be",
        "utf_16_le",
        "utf_32",
        "utf_32_le",
        "utf_32_be",
        "utf_7",
    } or issubclass(
        importlib.import_module("encodings.{}".format(name)).IncrementalDecoder,
        MultibyteIncrementalDecoder,
    )
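
# Beyond the hard-coded UTF set, CJK codecs such as "big5" inherit from
# MultibyteIncrementalDecoder and are caught by the issubclass check:
#
#     >>> is_multi_byte_encoding("utf_8")
#     True
#     >>> is_multi_byte_encoding("cp1252")
#     False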

def identify_sig_or_bom(sequence: bytes) -> Tuple[Optional[str], bytes]:
    """
    Identify and extract SIG/BOM in given sequence.
    """
    for iana_encoding in ENCODING_MARKS:
        marks: Union[bytes, List[bytes]] = ENCODING_MARKS[iana_encoding]

        if isinstance(marks, bytes):
            marks = [marks]

        for mark in marks:
            if sequence.startswith(mark):
                return iana_encoding, mark

    return None, b""
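
# A sketch, assuming ENCODING_MARKS maps "utf_8" to the standard UTF-8 BOM:
#
#     >>> identify_sig_or_bom(b"\xef\xbb\xbfhello")
#     ('utf_8', b'\xef\xbb\xbf')
#     >>> identify_sig_or_bom(b"hello")
#     (None, b'')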

def should_strip_sig_or_bom(iana_encoding: str) -> bool:
    return iana_encoding not in {"utf_16", "utf_32"}


def iana_name(cp_name: str, strict: bool = True) -> str:
    cp_name = cp_name.lower().replace("-", "_")

    encoding_alias: str
    encoding_iana: str

    for encoding_alias, encoding_iana in aliases.items():
        if cp_name in [encoding_alias, encoding_iana]:
            return encoding_iana

    if strict:
        raise ValueError("Unable to retrieve IANA for '{}'".format(cp_name))

    return cp_name
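
# Normalization examples (the alias table comes from encodings.aliases):
#
#     >>> iana_name("UTF-8")
#     'utf_8'
#     >>> iana_name("not-a-codec", strict=False)
#     'not_a_codec'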

def range_scan(decoded_sequence: str) -> List[str]:
    ranges: Set[str] = set()

    for character in decoded_sequence:
        character_range: Optional[str] = unicode_range(character)

        if character_range is None:
            continue

        ranges.add(character_range)

    return list(ranges)
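
# The result order is unspecified (it comes from a set); sort it for stable
# output (range names as assumed in the unicode_range example above):
#
#     >>> sorted(range_scan("abcé"))
#     ['Basic Latin', 'Latin-1 Supplement']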

def cp_similarity(iana_name_a: str, iana_name_b: str) -> float:
    if is_multi_byte_encoding(iana_name_a) or is_multi_byte_encoding(iana_name_b):
        return 0.0

    decoder_a = importlib.import_module(
        "encodings.{}".format(iana_name_a)
    ).IncrementalDecoder
    decoder_b = importlib.import_module(
        "encodings.{}".format(iana_name_b)
    ).IncrementalDecoder

    id_a: IncrementalDecoder = decoder_a(errors="ignore")
    id_b: IncrementalDecoder = decoder_b(errors="ignore")

    character_match_count: int = 0

    for i in range(255):
        to_be_decoded: bytes = bytes([i])
        if id_a.decode(to_be_decoded) == id_b.decode(to_be_decoded):
            character_match_count += 1

    return character_match_count / 254
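
# The score is roughly the fraction of byte values 0x00-0xFE that both decoders
# map to the same character; e.g. cp_similarity("cp1252", "latin_1") is high
# because those code pages differ only in the 0x80-0x9F window, while unrelated
# pairs score much lower.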

def is_cp_similar(iana_name_a: str, iana_name_b: str) -> bool:
    """
    Determine if two code pages are at least 80% similar. The IANA_SUPPORTED_SIMILAR
    dict was generated using the function cp_similarity.
    """
    return (
        iana_name_a in IANA_SUPPORTED_SIMILAR
        and iana_name_b in IANA_SUPPORTED_SIMILAR[iana_name_a]
    )

def set_logging_handler(
    name: str = "charset_normalizer",
    level: int = logging.INFO,
    format_string: str = "%(asctime)s | %(levelname)s | %(message)s",
) -> None:
    logger = logging.getLogger(name)
    logger.setLevel(level)

    handler = logging.StreamHandler()
    handler.setFormatter(logging.Formatter(format_string))
    logger.addHandler(handler)
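
# Opt-in logging from application code (note: each call attaches a new handler):
#
#     >>> set_logging_handler(level=logging.DEBUG)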

def cut_sequence_chunks(
    sequences: bytes,
    encoding_iana: str,
    offsets: range,
    chunk_size: int,
    bom_or_sig_available: bool,
    strip_sig_or_bom: bool,
    sig_payload: bytes,
    is_multi_byte_decoder: bool,
    decoded_payload: Optional[str] = None,
) -> Generator[str, None, None]:
    if decoded_payload and is_multi_byte_decoder is False:
        for i in offsets:
            chunk = decoded_payload[i : i + chunk_size]
            if not chunk:
                break
            yield chunk
    else:
        for i in offsets:
            chunk_end = i + chunk_size
            if chunk_end > len(sequences) + 8:
                continue

            cut_sequence = sequences[i : i + chunk_size]

            if bom_or_sig_available and strip_sig_or_bom is False:
                cut_sequence = sig_payload + cut_sequence

            chunk = cut_sequence.decode(
                encoding_iana,
                errors="ignore" if is_multi_byte_decoder else "strict",
            )

            # multi-byte bad cutting detector and adjustment
            # not the cleanest way to perform that fix but clever enough for now.
            if is_multi_byte_decoder and i > 0:
                chunk_partial_size_chk: int = min(chunk_size, 16)

                if (
                    decoded_payload
                    and chunk[:chunk_partial_size_chk] not in decoded_payload
                ):
                    for j in range(i, i - 4, -1):
                        cut_sequence = sequences[j:chunk_end]

                        if bom_or_sig_available and strip_sig_or_bom is False:
                            cut_sequence = sig_payload + cut_sequence

                        chunk = cut_sequence.decode(encoding_iana, errors="ignore")

                        if chunk[:chunk_partial_size_chk] in decoded_payload:
                            break

            yield chunk
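
# A minimal driving loop (hypothetical values; inside charset_normalizer this
# generator is fed by the detection routine):
#
#     >>> payload = "hello world".encode("utf_8")
#     >>> for chunk in cut_sequence_chunks(
#     ...     payload, "utf_8", range(0, len(payload), 5), 5,
#     ...     bom_or_sig_available=False, strip_sig_or_bom=False,
#     ...     sig_payload=b"", is_multi_byte_decoder=True,
#     ... ):
#     ...     print(chunk)
#     hello
#      worl
#     d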