|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582 |
- from functools import lru_cache
- from logging import getLogger
- from typing import List, Optional
-
- from .constant import (
- COMMON_SAFE_ASCII_CHARACTERS,
- TRACE,
- UNICODE_SECONDARY_RANGE_KEYWORD,
- )
- from .utils import (
- is_accentuated,
- is_ascii,
- is_case_variable,
- is_cjk,
- is_emoticon,
- is_hangul,
- is_hiragana,
- is_katakana,
- is_latin,
- is_punctuation,
- is_separator,
- is_symbol,
- is_thai,
- is_unprintable,
- remove_accent,
- unicode_range,
- )
-
-
class MessDetectorPlugin:
    """
    Abstract base shared by every mess detection plugin.

    A concrete detector MUST subclass it and override each member below;
    the defaults do nothing but raise NotImplementedError.
    """

    def eligible(self, character: str) -> bool:
        """
        Tell whether the given character should be handed to feed().
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        Core routine, run once for a character.
        This is where the logic deciding in which cases the text looks
        chaotic belongs.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Put the plugin back into its initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Chaos ratio derived from everything feed() has seen so far.
        Must NOT be lower than 0.; there is no upper bound.
        """
        raise NotImplementedError  # pragma: nocover
-
-
class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
    """
    Flag text in which punctuation and symbols make up too large a share of
    the printable characters.
    """

    def __init__(self) -> None:
        self._punctuation_count: int = 0
        self._symbol_count: int = 0
        self._character_count: int = 0

        # Previous character fed; an immediate repetition is not counted again.
        self._last_printable_char: Optional[str] = None
        # Not read anywhere in this class; kept so the attribute set is unchanged.
        self._frenzy_symbol_in_word: bool = False

    def eligible(self, character: str) -> bool:
        """Only printable characters are inspected."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character, and record it as punctuation or symbol unless it
        repeats the previous character or belongs to COMMON_SAFE_ASCII_CHARACTERS.
        """
        self._character_count += 1

        if (
            character != self._last_printable_char
            and character not in COMMON_SAFE_ASCII_CHARACTERS
        ):
            if is_punctuation(character):
                self._punctuation_count += 1
            elif (
                character.isdigit() is False
                and is_symbol(character)
                and is_emoticon(character) is False
            ):
                # A symbol weighs twice as much as a punctuation mark.
                self._symbol_count += 2

        self._last_printable_char = character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every field to the value set by __init__, including the
        memory of the last character seen, so that a reused instance does not
        skip its first character.
        """
        self._punctuation_count = 0
        self._character_count = 0
        self._symbol_count = 0
        self._last_printable_char = None
        self._frenzy_symbol_in_word = False

    @property
    def ratio(self) -> float:
        """
        Weighted share of punctuation/symbols among the characters fed.
        Anything below 0.3 is reported as 0.0.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_punctuation: float = (
            self._punctuation_count + self._symbol_count
        ) / self._character_count

        return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
-
-
class TooManyAccentuatedPlugin(MessDetectorPlugin):
    """
    Flag text in which accentuated letters make up too large a share of the
    alphabetic characters.
    """

    def __init__(self) -> None:
        self._character_count: int = 0
        self._accentuated_count: int = 0

    def eligible(self, character: str) -> bool:
        """Only alphabetic characters are inspected."""
        return character.isalpha()

    def feed(self, character: str) -> None:
        """Count the letter, and count it again as accentuated when it is."""
        self._character_count += 1

        if is_accentuated(character):
            self._accentuated_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._accentuated_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of accentuated letters; 0.0 when fewer than 8 letters were fed
        or when the share is below 0.35.
        """
        seen: int = self._character_count

        # Fewer than 8 letters (which includes none at all): no verdict.
        if seen < 8:
            return 0.0

        ratio_of_accentuation: float = self._accentuated_count / seen

        if ratio_of_accentuation < 0.35:
            return 0.0
        return ratio_of_accentuation
-
-
class UnprintablePlugin(MessDetectorPlugin):
    """
    Flag text that contains unprintable characters.
    """

    def __init__(self) -> None:
        self._unprintable_count: int = 0
        self._character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Count the character, and count it as unprintable when it is."""
        if is_unprintable(character):
            self._unprintable_count += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """
        Zero both counters. The character count must be cleared as well,
        otherwise the ratio computed after a reset is diluted by characters
        fed before it.
        """
        self._unprintable_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of unprintable characters, weighted by a factor of 8.
        Returns 0.0 when nothing was fed.
        """
        if self._character_count == 0:
            return 0.0

        return (self._unprintable_count * 8) / self._character_count
-
-
class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
    """
    Flag Latin text in which accentuated letters directly follow one another
    in an unlikely way.
    """

    def __init__(self) -> None:
        self._successive_count: int = 0
        self._character_count: int = 0

        self._last_latin_character: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only alphabetic Latin characters are inspected."""
        return character.isalpha() and is_latin(character)

    def feed(self, character: str) -> None:
        """
        Compare the letter with the previous one; two accentuated letters in
        a row score one point when both are upper case and one more point
        when they share the same base letter once accents are removed.
        """
        self._character_count += 1
        previous: Optional[str] = self._last_latin_character

        if (
            previous is not None
            and is_accentuated(character)
            and is_accentuated(previous)
        ):
            if character.isupper() and previous.isupper():
                self._successive_count += 1
            # Even worse: the same letter repeated with a different accent.
            if remove_accent(character) == remove_accent(previous):
                self._successive_count += 1

        self._last_latin_character = character

    def reset(self) -> None:  # pragma: no cover
        """Return to the initial state."""
        self._last_latin_character = None
        self._character_count = 0
        self._successive_count = 0

    @property
    def ratio(self) -> float:
        """Twice the score divided by the letters fed; 0.0 when none were."""
        if self._character_count == 0:
            return 0.0

        return (2 * self._successive_count) / self._character_count
-
-
class SuspiciousRange(MessDetectorPlugin):
    """
    Flag text in which neighbouring characters come from Unicode ranges that
    is_suspiciously_successive_range() judges unlikely to be adjacent.
    """

    def __init__(self) -> None:
        self._suspicious_successive_range_count: int = 0
        self._character_count: int = 0
        self._last_printable_seen: Optional[str] = None

    def eligible(self, character: str) -> bool:
        """Only printable characters are inspected."""
        return character.isprintable()

    def feed(self, character: str) -> None:
        """
        Count the character and compare its Unicode range with the one of
        the previous character, if any.
        """
        self._character_count += 1

        # Whitespace, punctuation and common safe ASCII break the chain:
        # the next character will have no predecessor to be compared with.
        breaks_chain = (
            character.isspace()
            or is_punctuation(character)
            or character in COMMON_SAFE_ASCII_CHARACTERS
        )
        if breaks_chain:
            self._last_printable_seen = None
            return

        previous: Optional[str] = self._last_printable_seen

        if previous is not None and is_suspiciously_successive_range(
            unicode_range(previous), unicode_range(character)
        ):
            self._suspicious_successive_range_count += 1

        self._last_printable_seen = character

    def reset(self) -> None:  # pragma: no cover
        """Return to the initial state."""
        self._last_printable_seen = None
        self._suspicious_successive_range_count = 0
        self._character_count = 0

    @property
    def ratio(self) -> float:
        """
        Twice the number of suspicious neighbours divided by the characters
        fed. Anything below 0.1, or no input at all, is reported as 0.0.
        """
        if self._character_count == 0:
            return 0.0

        ratio_of_suspicious_range_usage: float = (
            self._suspicious_successive_range_count * 2
        ) / self._character_count

        return (
            0.0
            if ratio_of_suspicious_range_usage < 0.1
            else ratio_of_suspicious_range_usage
        )
-
-
class SuperWeirdWordPlugin(MessDetectorPlugin):
    """
    Flag text made of implausible words: words overloaded with accents,
    words ending with an upper case accentuated letter, very long "foreign"
    words, and words containing symbols.
    """

    def __init__(self) -> None:
        self._word_count: int = 0
        self._bad_word_count: int = 0
        self._foreign_long_count: int = 0

        self._is_current_word_bad: bool = False
        self._foreign_long_watch: bool = False

        self._character_count: int = 0
        self._bad_character_count: int = 0

        # Word being assembled, and how many of its letters are accentuated.
        self._buffer: str = ""
        self._buffer_accent_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """
        Accumulate letters into the current word; on whitespace, punctuation
        or a separator, evaluate the word and start a new one.
        """
        if character.isalpha():
            self._buffer += character
            if is_accentuated(character):
                self._buffer_accent_count += 1
            # Start watching the word as soon as it holds a letter that is
            # non-Latin or accentuated, and belongs to none of the scripts
            # excluded below.
            if (
                self._foreign_long_watch is False
                and (is_latin(character) is False or is_accentuated(character))
                and is_cjk(character) is False
                and is_hangul(character) is False
                and is_katakana(character) is False
                and is_hiragana(character) is False
                and is_thai(character) is False
            ):
                self._foreign_long_watch = True
            return

        # A non-letter with no word in progress carries no information.
        if not self._buffer:
            return

        if character.isspace() or is_punctuation(character) or is_separator(character):
            self._word_count += 1
            buffer_length: int = len(self._buffer)

            self._character_count += buffer_length

            if buffer_length >= 4:
                if self._buffer_accent_count / buffer_length > 0.34:
                    self._is_current_word_bad = True
                # Word/Buffer ending with an upper case accentuated letter are so rare,
                # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
                if is_accentuated(self._buffer[-1]) and self._buffer[-1].isupper():
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if buffer_length >= 24 and self._foreign_long_watch:
                uppercase_count: int = sum(1 for c in self._buffer if c.isupper())

                # Some, but at most 30% upper case letters: probably camelCase.
                probable_camel_cased: bool = (
                    uppercase_count > 0 and uppercase_count / buffer_length <= 0.3
                )

                if not probable_camel_cased:
                    self._foreign_long_count += 1
                    self._is_current_word_bad = True

            if self._is_current_word_bad:
                self._bad_word_count += 1
                self._bad_character_count += buffer_length
                self._is_current_word_bad = False

            self._foreign_long_watch = False
            self._buffer = ""
            self._buffer_accent_count = 0
        elif (
            character not in {"<", ">", "-", "=", "~", "|", "_"}
            and character.isdigit() is False
            and is_symbol(character)
        ):
            # A symbol inside a word taints the word and becomes part of it.
            self._is_current_word_bad = True
            self._buffer += character

    def reset(self) -> None:  # pragma: no cover
        """
        Restore every field to the value set by __init__, including the
        accent counter of the word in progress, so that accents seen before
        the reset cannot leak into the first word read after it.
        """
        self._buffer = ""
        self._buffer_accent_count = 0
        self._is_current_word_bad = False
        self._foreign_long_watch = False
        self._bad_word_count = 0
        self._word_count = 0
        self._character_count = 0
        self._bad_character_count = 0
        self._foreign_long_count = 0

    @property
    def ratio(self) -> float:
        """
        Share of characters that belong to bad words. Reported as 0.0 while
        at most 10 words were read and no long/foreign word was flagged.
        """
        if self._word_count <= 10 and self._foreign_long_count == 0:
            return 0.0

        return self._bad_character_count / self._character_count
-
-
class CjkInvalidStopPlugin(MessDetectorPlugin):
    """
    GB (Chinese) based encodings often render the stop incorrectly when the
    content does not fit, which is easy to spot: look for an overuse of
    '丅' and '丄'.
    """

    def __init__(self) -> None:
        self._wrong_stop_count: int = 0
        self._cjk_character_count: int = 0

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """Count wrong stops and, separately, any other CJK character."""
        if character in {"丅", "丄"}:
            self._wrong_stop_count += 1
        elif is_cjk(character):
            self._cjk_character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Zero both counters."""
        self._cjk_character_count = 0
        self._wrong_stop_count = 0

    @property
    def ratio(self) -> float:
        """
        Wrong stops per CJK character; 0.0 while fewer than 16 CJK
        characters were seen.
        """
        cjk_seen: int = self._cjk_character_count

        if cjk_seen < 16:
            return 0.0
        return self._wrong_stop_count / cjk_seen
-
-
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Flag chunks of text in which the letter case keeps flipping between
    upper and lower from one letter to the next.
    """

    def __init__(self) -> None:
        # True right after a single case flip; a second flip in a row scores.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        # Score of the chunk in progress, and total of the chunks retained.
        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        """Every character is inspected."""
        return True

    def feed(self, character: str) -> None:
        """
        Track case flips inside the current chunk; a character that is not a
        case-variable letter closes a non-empty chunk.
        """
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # The chunk score is retained only for chunks of at most 64
            # characters, holding a non-ASCII character, and not closed by
            # a digit.
            chunk_is_retained = (
                self._character_count_since_last_sep <= 64
                and not character.isdigit()
                and not self._current_ascii_only
            )
            if chunk_is_retained:
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            # Open a fresh chunk; the separator itself is still counted.
            self._character_count += 1
            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._current_ascii_only = True
            self._last_alpha_seen = None
            self._buf = False

            return

        if self._current_ascii_only and is_ascii(character) is False:
            self._current_ascii_only = False

        previous: Optional[str] = self._last_alpha_seen

        if previous is not None:
            case_flipped: bool = (character.isupper() and previous.islower()) or (
                character.islower() and previous.isupper()
            )
            if case_flipped and self._buf:
                # Second flip in a row.
                self._successive_upper_lower_count += 2
                self._buf = False
            else:
                # First flip arms the flag; no flip disarms it.
                self._buf = case_flipped

        self._last_alpha_seen = character
        self._character_count_since_last_sep += 1
        self._character_count += 1

    def reset(self) -> None:  # pragma: no cover
        """Return to the initial state."""
        self._buf = False
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._character_count = 0
        self._last_alpha_seen = None
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        """Retained score divided by the characters fed; 0.0 when none were."""
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
-
-
@lru_cache(maxsize=1024)
def is_suspiciously_successive_range(
    unicode_range_a: Optional[str], unicode_range_b: Optional[str]
) -> bool:
    """
    Decide whether seeing these two Unicode ranges side by side is suspicious.

    Returns True when either range is None or when none of the exceptions
    below applies, False otherwise.
    """
    if unicode_range_a is None or unicode_range_b is None:
        return True

    if unicode_range_a == unicode_range_b:
        return False

    a_is_latin: bool = "Latin" in unicode_range_a
    b_is_latin: bool = "Latin" in unicode_range_b

    if a_is_latin and b_is_latin:
        return False

    if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
        return False

    # A Latin character may come with a combining diacritical mark,
    # eg. Vietnamese.
    if (a_is_latin or b_is_latin) and (
        "Combining" in unicode_range_a or "Combining" in unicode_range_b
    ):
        return False

    # Two ranges sharing a keyword that is not a secondary one are related.
    keywords_range_b = unicode_range_b.split(" ")

    if any(
        keyword in keywords_range_b
        for keyword in unicode_range_a.split(" ")
        if keyword not in UNICODE_SECONDARY_RANGE_KEYWORD
    ):
        return False

    any_cjk: bool = "CJK" in unicode_range_a or "CJK" in unicode_range_b

    # Japanese exception: kana next to CJK, or kana next to kana.
    range_a_jp_chars: bool = unicode_range_a in ("Hiragana", "Katakana")
    range_b_jp_chars: bool = unicode_range_b in ("Hiragana", "Katakana")

    if any_cjk and (range_a_jp_chars or range_b_jp_chars):
        return False
    if range_a_jp_chars and range_b_jp_chars:
        return False

    if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
        if any_cjk:
            return False
        if "Basic Latin" in (unicode_range_a, unicode_range_b):
            return False

    # Chinese/Japanese use dedicated ranges for punctuation and/or separators.
    # (A pair of kana ranges has already returned above, so only CJK matters.)
    if any_cjk:
        if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
            return False
        if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
            return False

    return True
-
-
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute the mess ratio of a decoded sequence by running it through one
    instance of every MessDetectorPlugin subclass.

    The plugin ratios are summed at regular checkpoints; as soon as the sum
    reaches maximum_threshold the scan stops early. With debug enabled, a
    summary is logged at TRACE level. The result is rounded to 3 decimals.
    """
    detectors: List[MessDetectorPlugin] = [
        plugin_class() for plugin_class in MessDetectorPlugin.__subclasses__()
    ]

    # A newline is appended to the sequence before it is scanned.
    payload: str = decoded_sequence + "\n"
    length: int = len(payload)
    last_index: int = length - 1

    # Checkpoint spacing grows with the size of the input.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    mean_mess_ratio: float = 0.0

    for index, character in enumerate(payload):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        at_checkpoint = index == last_index or (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        )
        if not at_checkpoint:
            continue

        mean_mess_ratio = sum(dt.ratio for dt in detectors)

        if mean_mess_ratio >= maximum_threshold:
            break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)
|