Development of an internal social media platform with personalised dashboards for students
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

spelling.py 13KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357
  1. # -*- coding: utf-8 -*-
  2. # Copyright (c) 2014-2017 Claudiu Popa <pcmanticore@gmail.com>
  3. # Copyright (c) 2014 Michal Nowikowski <godfryd@gmail.com>
  4. # Copyright (c) 2014 LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
  5. # Copyright (c) 2015 Pavel Roskin <proski@gnu.org>
  6. # Copyright (c) 2015 Ionel Cristian Maries <contact@ionelmc.ro>
  7. # Copyright (c) 2016-2017 Pedro Algarvio <pedro@algarvio.me>
  8. # Copyright (c) 2016 Alexander Todorov <atodorov@otb.bg>
  9. # Copyright (c) 2017 Łukasz Rogalski <rogalski.91@gmail.com>
  10. # Copyright (c) 2017 Mikhail Fesenko <proggga@gmail.com>
  11. # Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
  12. # For details: https://github.com/PyCQA/pylint/blob/master/COPYING
  13. """Checker for spelling errors in comments and docstrings.
  14. """
  15. import os
  16. import tokenize
  17. import re
  18. try:
  19. import enchant
  20. from enchant.tokenize import (get_tokenizer,
  21. Chunker,
  22. Filter,
  23. EmailFilter,
  24. URLFilter,
  25. WikiWordFilter)
  26. except ImportError:
  27. enchant = None
  28. # pylint: disable=old-style-class,no-init
  29. class Filter:
  30. def _skip(self, word):
  31. raise NotImplementedError
  32. class Chunker:
  33. pass
  34. import six
  35. from pylint.interfaces import ITokenChecker, IAstroidChecker
  36. from pylint.checkers import BaseTokenChecker
  37. from pylint.checkers.utils import check_messages
  38. from pylint.utils import safe_decode
  39. if enchant is not None:
  40. br = enchant.Broker()
  41. dicts = br.list_dicts()
  42. dict_choices = [''] + [d[0] for d in dicts]
  43. dicts = ["%s (%s)" % (d[0], d[1].name) for d in dicts]
  44. dicts = ", ".join(dicts)
  45. instr = ""
  46. else:
  47. dicts = "none"
  48. dict_choices = ['']
  49. instr = " To make it working install python-enchant package."
  50. class WordsWithDigigtsFilter(Filter):
  51. """Skips words with digits.
  52. """
  53. def _skip(self, word):
  54. for char in word:
  55. if char.isdigit():
  56. return True
  57. return False
  58. class WordsWithUnderscores(Filter):
  59. """Skips words with underscores.
  60. They are probably function parameter names.
  61. """
  62. def _skip(self, word):
  63. return '_' in word
  64. class CamelCasedWord(Filter):
  65. r"""Filter skipping over camelCasedWords.
  66. This filter skips any words matching the following regular expression:
  67. ^([a-z]\w+[A-Z]+\w+)
  68. That is, any words that are camelCasedWords.
  69. """
  70. _pattern = re.compile(r"^([a-z]+([\d]|[A-Z])(?:\w+)?)")
  71. def _skip(self, word):
  72. return bool(self._pattern.match(word))
  73. class SphinxDirectives(Filter):
  74. r"""Filter skipping over Sphinx Directives.
  75. This filter skips any words matching the following regular expression:
  76. ^:([a-z]+):`([^`]+)(`)?
  77. That is, for example, :class:`BaseQuery`
  78. """
  79. # The final ` in the pattern is optional because enchant strips it out
  80. _pattern = re.compile(r"^:([a-z]+):`([^`]+)(`)?")
  81. def _skip(self, word):
  82. return bool(self._pattern.match(word))
  83. class ForwardSlashChunkder(Chunker):
  84. '''
  85. This chunker allows splitting words like 'before/after' into 'before' and 'after'
  86. '''
  87. def next(self):
  88. while True:
  89. if not self._text:
  90. raise StopIteration()
  91. if '/' not in self._text:
  92. text = self._text
  93. self._offset = 0
  94. self._text = ''
  95. return (text, 0)
  96. pre_text, post_text = self._text.split('/', 1)
  97. self._text = post_text
  98. self._offset = 0
  99. if not pre_text or not post_text or \
  100. not pre_text[-1].isalpha() or not post_text[0].isalpha():
  101. self._text = ''
  102. self._offset = 0
  103. return (pre_text + '/' + post_text, 0)
  104. return (pre_text, 0)
  105. def _next(self):
  106. while True:
  107. if '/' not in self._text:
  108. return (self._text, 0)
  109. pre_text, post_text = self._text.split('/', 1)
  110. if not pre_text or not post_text:
  111. break
  112. if not pre_text[-1].isalpha() or not post_text[0].isalpha():
  113. raise StopIteration()
  114. self._text = pre_text + ' ' + post_text
  115. raise StopIteration()
  116. class SpellingChecker(BaseTokenChecker):
  117. """Check spelling in comments and docstrings"""
  118. __implements__ = (ITokenChecker, IAstroidChecker)
  119. name = 'spelling'
  120. msgs = {
  121. 'C0401': ('Wrong spelling of a word \'%s\' in a comment:\n%s\n'
  122. '%s\nDid you mean: \'%s\'?',
  123. 'wrong-spelling-in-comment',
  124. 'Used when a word in comment is not spelled correctly.'),
  125. 'C0402': ('Wrong spelling of a word \'%s\' in a docstring:\n%s\n'
  126. '%s\nDid you mean: \'%s\'?',
  127. 'wrong-spelling-in-docstring',
  128. 'Used when a word in docstring is not spelled correctly.'),
  129. 'C0403': ('Invalid characters %r in a docstring',
  130. 'invalid-characters-in-docstring',
  131. 'Used when a word in docstring cannot be checked by enchant.'),
  132. }
  133. options = (('spelling-dict',
  134. {'default' : '', 'type' : 'choice', 'metavar' : '<dict name>',
  135. 'choices': dict_choices,
  136. 'help' : 'Spelling dictionary name. '
  137. 'Available dictionaries: %s.%s' % (dicts, instr)}),
  138. ('spelling-ignore-words',
  139. {'default' : '',
  140. 'type' : 'string',
  141. 'metavar' : '<comma separated words>',
  142. 'help' : 'List of comma separated words that '
  143. 'should not be checked.'}),
  144. ('spelling-private-dict-file',
  145. {'default' : '',
  146. 'type' : 'string',
  147. 'metavar' : '<path to file>',
  148. 'help' : 'A path to a file that contains private '
  149. 'dictionary; one word per line.'}),
  150. ('spelling-store-unknown-words',
  151. {'default' : 'n', 'type' : 'yn', 'metavar' : '<y_or_n>',
  152. 'help' : 'Tells whether to store unknown words to '
  153. 'indicated private dictionary in '
  154. '--spelling-private-dict-file option instead of '
  155. 'raising a message.'}),
  156. ('max-spelling-suggestions',
  157. {'default': 4, 'type': 'int', 'metavar': 'N',
  158. 'help': 'Limits count of emitted suggestions for '
  159. 'spelling mistakes'}),
  160. )
  161. def open(self):
  162. self.initialized = False
  163. self.private_dict_file = None
  164. if enchant is None:
  165. return
  166. dict_name = self.config.spelling_dict
  167. if not dict_name:
  168. return
  169. self.ignore_list = [w.strip() for w in self.config.spelling_ignore_words.split(",")]
  170. # "param" appears in docstring in param description and
  171. # "pylint" appears in comments in pylint pragmas.
  172. self.ignore_list.extend(["param", "pylint"])
  173. # Expand tilde to allow e.g. spelling-private-dict-file = ~/.pylintdict
  174. if self.config.spelling_private_dict_file:
  175. self.config.spelling_private_dict_file = os.path.expanduser(
  176. self.config.spelling_private_dict_file)
  177. if self.config.spelling_private_dict_file:
  178. self.spelling_dict = enchant.DictWithPWL(
  179. dict_name, self.config.spelling_private_dict_file)
  180. self.private_dict_file = open(
  181. self.config.spelling_private_dict_file, "a")
  182. else:
  183. self.spelling_dict = enchant.Dict(dict_name)
  184. if self.config.spelling_store_unknown_words:
  185. self.unknown_words = set()
  186. self.tokenizer = get_tokenizer(dict_name,
  187. chunkers=[ForwardSlashChunkder],
  188. filters=[EmailFilter,
  189. URLFilter,
  190. WikiWordFilter,
  191. WordsWithDigigtsFilter,
  192. WordsWithUnderscores,
  193. CamelCasedWord,
  194. SphinxDirectives])
  195. self.initialized = True
  196. def close(self):
  197. if self.private_dict_file:
  198. self.private_dict_file.close()
  199. def _check_spelling(self, msgid, line, line_num):
  200. original_line = line
  201. if line.strip().startswith('#'):
  202. line = line.strip()[1:]
  203. starts_with_comment = True
  204. else:
  205. starts_with_comment = False
  206. for word, _ in self.tokenizer(line.strip()):
  207. if six.PY2:
  208. lower_cased_word = word.lower()
  209. else:
  210. lower_cased_word = word.casefold()
  211. # Skip words from ignore list.
  212. if word in self.ignore_list or lower_cased_word in self.ignore_list:
  213. continue
  214. # Strip starting u' from unicode literals and r' from raw strings.
  215. if word.startswith(("u'", 'u"', "r'", 'r"')) and len(word) > 2:
  216. word = word[2:]
  217. lower_cased_word = lower_cased_word[2:]
  218. # If it is a known word, then continue.
  219. try:
  220. if self.spelling_dict.check(lower_cased_word):
  221. # The lower cased version of word passed spell checking
  222. continue
  223. # If we reached this far, it means there was a spelling mistake.
  224. # Let's retry with the original work because 'unicode' is a
  225. # spelling mistake but 'Unicode' is not
  226. if self.spelling_dict.check(word):
  227. continue
  228. except enchant.errors.Error:
  229. self.add_message('invalid-characters-in-docstring',
  230. line=line_num, args=(word,))
  231. continue
  232. # Store word to private dict or raise a message.
  233. if self.config.spelling_store_unknown_words:
  234. if lower_cased_word not in self.unknown_words:
  235. self.private_dict_file.write("%s\n" % lower_cased_word)
  236. self.unknown_words.add(lower_cased_word)
  237. else:
  238. # Present up to N suggestions.
  239. suggestions = self.spelling_dict.suggest(word)
  240. del suggestions[self.config.max_spelling_suggestions:]
  241. m = re.search(r"(\W|^)(%s)(\W|$)" % word, line)
  242. if m:
  243. # Start position of second group in regex.
  244. col = m.regs[2][0]
  245. else:
  246. col = line.index(word)
  247. if starts_with_comment:
  248. col += 1
  249. indicator = (" " * col) + ("^" * len(word))
  250. self.add_message(msgid, line=line_num,
  251. args=(word, original_line,
  252. indicator,
  253. "'{0}'".format("' or '".join(suggestions))))
  254. def process_tokens(self, tokens):
  255. if not self.initialized:
  256. return
  257. # Process tokens and look for comments.
  258. for (tok_type, token, (start_row, _), _, _) in tokens:
  259. if tok_type == tokenize.COMMENT:
  260. if start_row == 1 and token.startswith('#!/'):
  261. # Skip shebang lines
  262. continue
  263. if token.startswith('# pylint:'):
  264. # Skip pylint enable/disable comments
  265. continue
  266. self._check_spelling('wrong-spelling-in-comment',
  267. token, start_row)
  268. @check_messages('wrong-spelling-in-docstring')
  269. def visit_module(self, node):
  270. if not self.initialized:
  271. return
  272. self._check_docstring(node)
  273. @check_messages('wrong-spelling-in-docstring')
  274. def visit_classdef(self, node):
  275. if not self.initialized:
  276. return
  277. self._check_docstring(node)
  278. @check_messages('wrong-spelling-in-docstring')
  279. def visit_functiondef(self, node):
  280. if not self.initialized:
  281. return
  282. self._check_docstring(node)
  283. visit_asyncfunctiondef = visit_functiondef
  284. def _check_docstring(self, node):
  285. """check the node has any spelling errors"""
  286. docstring = node.doc
  287. if not docstring:
  288. return
  289. start_line = node.lineno + 1
  290. if six.PY2:
  291. encoding = node.root().file_encoding
  292. docstring = safe_decode(docstring, encoding, 'replace')
  293. # Go through lines of docstring
  294. for idx, line in enumerate(docstring.splitlines()):
  295. self._check_spelling('wrong-spelling-in-docstring',
  296. line, start_line + idx)
  297. def register(linter):
  298. """required method to auto register this checker """
  299. linter.register_checker(SpellingChecker(linter))