Funktionierender Prototyp des Serious Games zur Vermittlung von Wissen zu Software-Engineering-Arbeitsmodellen.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

md.py 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582
  1. from functools import lru_cache
  2. from logging import getLogger
  3. from typing import List, Optional
  4. from .constant import (
  5. COMMON_SAFE_ASCII_CHARACTERS,
  6. TRACE,
  7. UNICODE_SECONDARY_RANGE_KEYWORD,
  8. )
  9. from .utils import (
  10. is_accentuated,
  11. is_ascii,
  12. is_case_variable,
  13. is_cjk,
  14. is_emoticon,
  15. is_hangul,
  16. is_hiragana,
  17. is_katakana,
  18. is_latin,
  19. is_punctuation,
  20. is_separator,
  21. is_symbol,
  22. is_thai,
  23. is_unprintable,
  24. remove_accent,
  25. unicode_range,
  26. )
  27. class MessDetectorPlugin:
  28. """
  29. Base abstract class used for mess detection plugins.
  30. All detectors MUST extend and implement given methods.
  31. """
  32. def eligible(self, character: str) -> bool:
  33. """
  34. Determine if given character should be fed in.
  35. """
  36. raise NotImplementedError # pragma: nocover
  37. def feed(self, character: str) -> None:
  38. """
  39. The main routine to be executed upon character.
  40. Insert the logic in witch the text would be considered chaotic.
  41. """
  42. raise NotImplementedError # pragma: nocover
  43. def reset(self) -> None: # pragma: no cover
  44. """
  45. Permit to reset the plugin to the initial state.
  46. """
  47. raise NotImplementedError
  48. @property
  49. def ratio(self) -> float:
  50. """
  51. Compute the chaos ratio based on what your feed() has seen.
  52. Must NOT be lower than 0.; No restriction gt 0.
  53. """
  54. raise NotImplementedError # pragma: nocover
  55. class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
  56. def __init__(self) -> None:
  57. self._punctuation_count: int = 0
  58. self._symbol_count: int = 0
  59. self._character_count: int = 0
  60. self._last_printable_char: Optional[str] = None
  61. self._frenzy_symbol_in_word: bool = False
  62. def eligible(self, character: str) -> bool:
  63. return character.isprintable()
  64. def feed(self, character: str) -> None:
  65. self._character_count += 1
  66. if (
  67. character != self._last_printable_char
  68. and character not in COMMON_SAFE_ASCII_CHARACTERS
  69. ):
  70. if is_punctuation(character):
  71. self._punctuation_count += 1
  72. elif (
  73. character.isdigit() is False
  74. and is_symbol(character)
  75. and is_emoticon(character) is False
  76. ):
  77. self._symbol_count += 2
  78. self._last_printable_char = character
  79. def reset(self) -> None: # pragma: no cover
  80. self._punctuation_count = 0
  81. self._character_count = 0
  82. self._symbol_count = 0
  83. @property
  84. def ratio(self) -> float:
  85. if self._character_count == 0:
  86. return 0.0
  87. ratio_of_punctuation: float = (
  88. self._punctuation_count + self._symbol_count
  89. ) / self._character_count
  90. return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
  91. class TooManyAccentuatedPlugin(MessDetectorPlugin):
  92. def __init__(self) -> None:
  93. self._character_count: int = 0
  94. self._accentuated_count: int = 0
  95. def eligible(self, character: str) -> bool:
  96. return character.isalpha()
  97. def feed(self, character: str) -> None:
  98. self._character_count += 1
  99. if is_accentuated(character):
  100. self._accentuated_count += 1
  101. def reset(self) -> None: # pragma: no cover
  102. self._character_count = 0
  103. self._accentuated_count = 0
  104. @property
  105. def ratio(self) -> float:
  106. if self._character_count == 0 or self._character_count < 8:
  107. return 0.0
  108. ratio_of_accentuation: float = self._accentuated_count / self._character_count
  109. return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
  110. class UnprintablePlugin(MessDetectorPlugin):
  111. def __init__(self) -> None:
  112. self._unprintable_count: int = 0
  113. self._character_count: int = 0
  114. def eligible(self, character: str) -> bool:
  115. return True
  116. def feed(self, character: str) -> None:
  117. if is_unprintable(character):
  118. self._unprintable_count += 1
  119. self._character_count += 1
  120. def reset(self) -> None: # pragma: no cover
  121. self._unprintable_count = 0
  122. @property
  123. def ratio(self) -> float:
  124. if self._character_count == 0:
  125. return 0.0
  126. return (self._unprintable_count * 8) / self._character_count
  127. class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
  128. def __init__(self) -> None:
  129. self._successive_count: int = 0
  130. self._character_count: int = 0
  131. self._last_latin_character: Optional[str] = None
  132. def eligible(self, character: str) -> bool:
  133. return character.isalpha() and is_latin(character)
  134. def feed(self, character: str) -> None:
  135. self._character_count += 1
  136. if (
  137. self._last_latin_character is not None
  138. and is_accentuated(character)
  139. and is_accentuated(self._last_latin_character)
  140. ):
  141. if character.isupper() and self._last_latin_character.isupper():
  142. self._successive_count += 1
  143. # Worse if its the same char duplicated with different accent.
  144. if remove_accent(character) == remove_accent(self._last_latin_character):
  145. self._successive_count += 1
  146. self._last_latin_character = character
  147. def reset(self) -> None: # pragma: no cover
  148. self._successive_count = 0
  149. self._character_count = 0
  150. self._last_latin_character = None
  151. @property
  152. def ratio(self) -> float:
  153. if self._character_count == 0:
  154. return 0.0
  155. return (self._successive_count * 2) / self._character_count
  156. class SuspiciousRange(MessDetectorPlugin):
  157. def __init__(self) -> None:
  158. self._suspicious_successive_range_count: int = 0
  159. self._character_count: int = 0
  160. self._last_printable_seen: Optional[str] = None
  161. def eligible(self, character: str) -> bool:
  162. return character.isprintable()
  163. def feed(self, character: str) -> None:
  164. self._character_count += 1
  165. if (
  166. character.isspace()
  167. or is_punctuation(character)
  168. or character in COMMON_SAFE_ASCII_CHARACTERS
  169. ):
  170. self._last_printable_seen = None
  171. return
  172. if self._last_printable_seen is None:
  173. self._last_printable_seen = character
  174. return
  175. unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
  176. unicode_range_b: Optional[str] = unicode_range(character)
  177. if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
  178. self._suspicious_successive_range_count += 1
  179. self._last_printable_seen = character
  180. def reset(self) -> None: # pragma: no cover
  181. self._character_count = 0
  182. self._suspicious_successive_range_count = 0
  183. self._last_printable_seen = None
  184. @property
  185. def ratio(self) -> float:
  186. if self._character_count == 0:
  187. return 0.0
  188. ratio_of_suspicious_range_usage: float = (
  189. self._suspicious_successive_range_count * 2
  190. ) / self._character_count
  191. if ratio_of_suspicious_range_usage < 0.1:
  192. return 0.0
  193. return ratio_of_suspicious_range_usage
  194. class SuperWeirdWordPlugin(MessDetectorPlugin):
  195. def __init__(self) -> None:
  196. self._word_count: int = 0
  197. self._bad_word_count: int = 0
  198. self._foreign_long_count: int = 0
  199. self._is_current_word_bad: bool = False
  200. self._foreign_long_watch: bool = False
  201. self._character_count: int = 0
  202. self._bad_character_count: int = 0
  203. self._buffer: str = ""
  204. self._buffer_accent_count: int = 0
  205. def eligible(self, character: str) -> bool:
  206. return True
  207. def feed(self, character: str) -> None:
  208. if character.isalpha():
  209. self._buffer += character
  210. if is_accentuated(character):
  211. self._buffer_accent_count += 1
  212. if (
  213. self._foreign_long_watch is False
  214. and (is_latin(character) is False or is_accentuated(character))
  215. and is_cjk(character) is False
  216. and is_hangul(character) is False
  217. and is_katakana(character) is False
  218. and is_hiragana(character) is False
  219. and is_thai(character) is False
  220. ):
  221. self._foreign_long_watch = True
  222. return
  223. if not self._buffer:
  224. return
  225. if (
  226. character.isspace() or is_punctuation(character) or is_separator(character)
  227. ) and self._buffer:
  228. self._word_count += 1
  229. buffer_length: int = len(self._buffer)
  230. self._character_count += buffer_length
  231. if buffer_length >= 4:
  232. if self._buffer_accent_count / buffer_length > 0.34:
  233. self._is_current_word_bad = True
  234. # Word/Buffer ending with an upper case accentuated letter are so rare,
  235. # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
  236. if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
  237. self._foreign_long_count += 1
  238. self._is_current_word_bad = True
  239. if buffer_length >= 24 and self._foreign_long_watch:
  240. camel_case_dst = [
  241. i
  242. for c, i in zip(self._buffer, range(0, buffer_length))
  243. if c.isupper()
  244. ]
  245. probable_camel_cased: bool = False
  246. if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
  247. probable_camel_cased = True
  248. if not probable_camel_cased:
  249. self._foreign_long_count += 1
  250. self._is_current_word_bad = True
  251. if self._is_current_word_bad:
  252. self._bad_word_count += 1
  253. self._bad_character_count += len(self._buffer)
  254. self._is_current_word_bad = False
  255. self._foreign_long_watch = False
  256. self._buffer = ""
  257. self._buffer_accent_count = 0
  258. elif (
  259. character not in {"<", ">", "-", "=", "~", "|", "_"}
  260. and character.isdigit() is False
  261. and is_symbol(character)
  262. ):
  263. self._is_current_word_bad = True
  264. self._buffer += character
  265. def reset(self) -> None: # pragma: no cover
  266. self._buffer = ""
  267. self._is_current_word_bad = False
  268. self._foreign_long_watch = False
  269. self._bad_word_count = 0
  270. self._word_count = 0
  271. self._character_count = 0
  272. self._bad_character_count = 0
  273. self._foreign_long_count = 0
  274. @property
  275. def ratio(self) -> float:
  276. if self._word_count <= 10 and self._foreign_long_count == 0:
  277. return 0.0
  278. return self._bad_character_count / self._character_count
  279. class CjkInvalidStopPlugin(MessDetectorPlugin):
  280. """
  281. GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
  282. can be easily detected. Searching for the overuse of '丅' and '丄'.
  283. """
  284. def __init__(self) -> None:
  285. self._wrong_stop_count: int = 0
  286. self._cjk_character_count: int = 0
  287. def eligible(self, character: str) -> bool:
  288. return True
  289. def feed(self, character: str) -> None:
  290. if character in {"丅", "丄"}:
  291. self._wrong_stop_count += 1
  292. return
  293. if is_cjk(character):
  294. self._cjk_character_count += 1
  295. def reset(self) -> None: # pragma: no cover
  296. self._wrong_stop_count = 0
  297. self._cjk_character_count = 0
  298. @property
  299. def ratio(self) -> float:
  300. if self._cjk_character_count < 16:
  301. return 0.0
  302. return self._wrong_stop_count / self._cjk_character_count
  303. class ArchaicUpperLowerPlugin(MessDetectorPlugin):
  304. def __init__(self) -> None:
  305. self._buf: bool = False
  306. self._character_count_since_last_sep: int = 0
  307. self._successive_upper_lower_count: int = 0
  308. self._successive_upper_lower_count_final: int = 0
  309. self._character_count: int = 0
  310. self._last_alpha_seen: Optional[str] = None
  311. self._current_ascii_only: bool = True
  312. def eligible(self, character: str) -> bool:
  313. return True
  314. def feed(self, character: str) -> None:
  315. is_concerned = character.isalpha() and is_case_variable(character)
  316. chunk_sep = is_concerned is False
  317. if chunk_sep and self._character_count_since_last_sep > 0:
  318. if (
  319. self._character_count_since_last_sep <= 64
  320. and character.isdigit() is False
  321. and self._current_ascii_only is False
  322. ):
  323. self._successive_upper_lower_count_final += (
  324. self._successive_upper_lower_count
  325. )
  326. self._successive_upper_lower_count = 0
  327. self._character_count_since_last_sep = 0
  328. self._last_alpha_seen = None
  329. self._buf = False
  330. self._character_count += 1
  331. self._current_ascii_only = True
  332. return
  333. if self._current_ascii_only is True and is_ascii(character) is False:
  334. self._current_ascii_only = False
  335. if self._last_alpha_seen is not None:
  336. if (character.isupper() and self._last_alpha_seen.islower()) or (
  337. character.islower() and self._last_alpha_seen.isupper()
  338. ):
  339. if self._buf is True:
  340. self._successive_upper_lower_count += 2
  341. self._buf = False
  342. else:
  343. self._buf = True
  344. else:
  345. self._buf = False
  346. self._character_count += 1
  347. self._character_count_since_last_sep += 1
  348. self._last_alpha_seen = character
  349. def reset(self) -> None: # pragma: no cover
  350. self._character_count = 0
  351. self._character_count_since_last_sep = 0
  352. self._successive_upper_lower_count = 0
  353. self._successive_upper_lower_count_final = 0
  354. self._last_alpha_seen = None
  355. self._buf = False
  356. self._current_ascii_only = True
  357. @property
  358. def ratio(self) -> float:
  359. if self._character_count == 0:
  360. return 0.0
  361. return self._successive_upper_lower_count_final / self._character_count
  362. @lru_cache(maxsize=1024)
  363. def is_suspiciously_successive_range(
  364. unicode_range_a: Optional[str], unicode_range_b: Optional[str]
  365. ) -> bool:
  366. """
  367. Determine if two Unicode range seen next to each other can be considered as suspicious.
  368. """
  369. if unicode_range_a is None or unicode_range_b is None:
  370. return True
  371. if unicode_range_a == unicode_range_b:
  372. return False
  373. if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
  374. return False
  375. if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
  376. return False
  377. # Latin characters can be accompanied with a combining diacritical mark
  378. # eg. Vietnamese.
  379. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
  380. "Combining" in unicode_range_a or "Combining" in unicode_range_b
  381. ):
  382. return False
  383. keywords_range_a, keywords_range_b = unicode_range_a.split(
  384. " "
  385. ), unicode_range_b.split(" ")
  386. for el in keywords_range_a:
  387. if el in UNICODE_SECONDARY_RANGE_KEYWORD:
  388. continue
  389. if el in keywords_range_b:
  390. return False
  391. # Japanese Exception
  392. range_a_jp_chars, range_b_jp_chars = (
  393. unicode_range_a
  394. in (
  395. "Hiragana",
  396. "Katakana",
  397. ),
  398. unicode_range_b in ("Hiragana", "Katakana"),
  399. )
  400. if (range_a_jp_chars or range_b_jp_chars) and (
  401. "CJK" in unicode_range_a or "CJK" in unicode_range_b
  402. ):
  403. return False
  404. if range_a_jp_chars and range_b_jp_chars:
  405. return False
  406. if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
  407. if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
  408. return False
  409. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  410. return False
  411. # Chinese/Japanese use dedicated range for punctuation and/or separators.
  412. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
  413. unicode_range_a in ["Katakana", "Hiragana"]
  414. and unicode_range_b in ["Katakana", "Hiragana"]
  415. ):
  416. if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
  417. return False
  418. if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
  419. return False
  420. return True
  421. @lru_cache(maxsize=2048)
  422. def mess_ratio(
  423. decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
  424. ) -> float:
  425. """
  426. Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
  427. """
  428. detectors: List[MessDetectorPlugin] = [
  429. md_class() for md_class in MessDetectorPlugin.__subclasses__()
  430. ]
  431. length: int = len(decoded_sequence) + 1
  432. mean_mess_ratio: float = 0.0
  433. if length < 512:
  434. intermediary_mean_mess_ratio_calc: int = 32
  435. elif length <= 1024:
  436. intermediary_mean_mess_ratio_calc = 64
  437. else:
  438. intermediary_mean_mess_ratio_calc = 128
  439. for character, index in zip(decoded_sequence + "\n", range(length)):
  440. for detector in detectors:
  441. if detector.eligible(character):
  442. detector.feed(character)
  443. if (
  444. index > 0 and index % intermediary_mean_mess_ratio_calc == 0
  445. ) or index == length - 1:
  446. mean_mess_ratio = sum(dt.ratio for dt in detectors)
  447. if mean_mess_ratio >= maximum_threshold:
  448. break
  449. if debug:
  450. logger = getLogger("charset_normalizer")
  451. logger.log(
  452. TRACE,
  453. "Mess-detector extended-analysis start. "
  454. f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
  455. f"maximum_threshold={maximum_threshold}",
  456. )
  457. if len(decoded_sequence) > 16:
  458. logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
  459. logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")
  460. for dt in detectors: # pragma: nocover
  461. logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")
  462. return round(mean_mess_ratio, 3)