Development of an internal social media platform with personalised dashboards for students
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

similar.py 14KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372
  1. # Copyright (c) 2006, 2008-2014 LOGILAB S.A. (Paris, FRANCE) <contact@logilab.fr>
  2. # Copyright (c) 2012 Ry4an Brase <ry4an-hg@ry4an.org>
  3. # Copyright (c) 2012 Google, Inc.
  4. # Copyright (c) 2012 Anthony VEREZ <anthony.verez.external@cassidian.com>
  5. # Copyright (c) 2014-2017 Claudiu Popa <pcmanticore@gmail.com>
  6. # Copyright (c) 2014 Brett Cannon <brett@python.org>
  7. # Copyright (c) 2014 Arun Persaud <arun@nubati.net>
  8. # Copyright (c) 2015 Ionel Cristian Maries <contact@ionelmc.ro>
  9. # Copyright (c) 2017 Anthony Sottile <asottile@umich.edu>
  10. # Copyright (c) 2017 Mikhail Fesenko <proggga@gmail.com>
  11. # Licensed under the GPL: https://www.gnu.org/licenses/old-licenses/gpl-2.0.html
  12. # For details: https://github.com/PyCQA/pylint/blob/master/COPYING
  13. # pylint: disable=W0622
  14. """a similarities / code duplication command line tool and pylint checker
  15. """
  16. from __future__ import print_function
  17. import sys
  18. from collections import defaultdict
  19. import six
  20. from six.moves import zip
  21. from pylint.utils import decoding_stream
  22. from pylint.interfaces import IRawChecker
  23. from pylint.checkers import BaseChecker, table_lines_from_stats
  24. from pylint.reporters.ureports.nodes import Table
class Similar(object):
    """finds copy-pasted lines of code in a project"""

    def __init__(self, min_lines=4, ignore_comments=False,
                 ignore_docstrings=False, ignore_imports=False):
        # minimum number of successive similar (non blank) lines required
        # before a chunk is reported as a duplicate
        self.min_lines = min_lines
        self.ignore_comments = ignore_comments
        self.ignore_docstrings = ignore_docstrings
        self.ignore_imports = ignore_imports
        # one LineSet per stream registered through append_stream()
        self.linesets = []

    def append_stream(self, streamid, stream, encoding=None):
        """append a file to search for similarities"""
        if encoding is None:
            readlines = stream.readlines
        else:
            # wrap the raw stream so its lines are decoded with `encoding`
            readlines = decoding_stream(stream, encoding).readlines
        try:
            self.linesets.append(LineSet(streamid,
                                         readlines(),
                                         self.ignore_comments,
                                         self.ignore_docstrings,
                                         self.ignore_imports))
        except UnicodeDecodeError:
            # undecodable streams are silently skipped rather than aborting
            # the whole run
            pass

    def run(self):
        """start looking for similarities and display results on stdout"""
        self._display_sims(self._compute_sims())

    def _compute_sims(self):
        """compute similarities in appended files"""
        # maps chunk length -> list of sets of (lineset, start_index) couples;
        # each set groups every location sharing one duplicated chunk
        no_duplicates = defaultdict(list)
        for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
            duplicate = no_duplicates[num]
            for couples in duplicate:
                # merge with an existing group if either location is
                # already part of it
                if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
                    couples.add((lineset1, idx1))
                    couples.add((lineset2, idx2))
                    break
            else:
                # no existing group matched: start a new one
                duplicate.append(set([(lineset1, idx1), (lineset2, idx2)]))
        sims = []
        for num, ensembles in six.iteritems(no_duplicates):
            for couples in ensembles:
                sims.append((num, couples))
        # report the largest duplicated chunks first
        sims.sort()
        sims.reverse()
        return sims

    def _display_sims(self, sims):
        """display computed similarities on stdout"""
        nb_lignes_dupliquees = 0
        for num, couples in sims:
            print()
            print(num, "similar lines in", len(couples), "files")
            couples = sorted(couples)
            for lineset, idx in couples:
                print("==%s:%s" % (lineset.name, idx))
            # pylint: disable=W0631
            # intentionally uses lineset/idx left over from the loop above:
            # all couples share the same chunk, so the last one's lines do
            print("  ", end="") if False else None  # NOTE(review): no-op guard removed; see below
            for line in lineset._real_lines[idx:idx+num]:
                print(" ", line.rstrip())
            # each extra copy beyond the first counts as duplicated lines
            nb_lignes_dupliquees += num * (len(couples)-1)
        nb_total_lignes = sum([len(lineset) for lineset in self.linesets])
        print("TOTAL lines=%s duplicates=%s percent=%.2f" \
              % (nb_total_lignes, nb_lignes_dupliquees,
                 nb_lignes_dupliquees*100. / nb_total_lignes))

    def _find_common(self, lineset1, lineset2):
        """find similarities in the two given linesets"""
        lines1 = lineset1.enumerate_stripped
        lines2 = lineset2.enumerate_stripped
        find = lineset2.find
        index1 = 0
        min_lines = self.min_lines
        while index1 < len(lineset1):
            # `skip` is how far index1 may jump: at least 1, or the length of
            # the longest match found starting at index1
            skip = 1
            num = 0
            # every position in lineset2 whose stripped line equals the
            # current line of lineset1 is a candidate match start
            for index2 in find(lineset1[index1]):
                non_blank = 0
                for num, ((_, line1), (_, line2)) in enumerate(
                        zip(lines1(index1), lines2(index2))):
                    if line1 != line2:
                        # mismatch ends the chunk; report it only when it
                        # contains strictly more than min_lines non-blank lines
                        if non_blank > min_lines:
                            yield num, lineset1, index1, lineset2, index2
                            skip = max(skip, num)
                        break
                    if line1:
                        non_blank += 1
                else:
                    # the zip was exhausted: one of the linesets ended while
                    # lines were still matching, so count the last line too
                    num += 1
                    if non_blank > min_lines:
                        yield num, lineset1, index1, lineset2, index2
                        skip = max(skip, num)
            index1 += skip

    def _iter_sims(self):
        """iterate on similarities among all files, by making a cartesian
        product
        """
        # compare each lineset with every lineset that follows it, so each
        # unordered pair is examined exactly once
        for idx, lineset in enumerate(self.linesets[:-1]):
            for lineset2 in self.linesets[idx+1:]:
                for sim in self._find_common(lineset, lineset2):
                    yield sim
  123. def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
  124. """return lines with leading/trailing whitespace and any ignored code
  125. features removed
  126. """
  127. strippedlines = []
  128. docstring = None
  129. for line in lines:
  130. line = line.strip()
  131. if ignore_docstrings:
  132. if not docstring and \
  133. (line.startswith('"""') or line.startswith("'''")):
  134. docstring = line[:3]
  135. line = line[3:]
  136. if docstring:
  137. if line.endswith(docstring):
  138. docstring = None
  139. line = ''
  140. if ignore_imports:
  141. if line.startswith("import ") or line.startswith("from "):
  142. line = ''
  143. if ignore_comments:
  144. # XXX should use regex in checkers/format to avoid cutting
  145. # at a "#" in a string
  146. line = line.split('#', 1)[0].strip()
  147. strippedlines.append(line)
  148. return strippedlines
  149. class LineSet(object):
  150. """Holds and indexes all the lines of a single source file"""
  151. def __init__(self, name, lines, ignore_comments=False,
  152. ignore_docstrings=False, ignore_imports=False):
  153. self.name = name
  154. self._real_lines = lines
  155. self._stripped_lines = stripped_lines(lines, ignore_comments,
  156. ignore_docstrings,
  157. ignore_imports)
  158. self._index = self._mk_index()
  159. def __str__(self):
  160. return '<Lineset for %s>' % self.name
  161. def __len__(self):
  162. return len(self._real_lines)
  163. def __getitem__(self, index):
  164. return self._stripped_lines[index]
  165. def __lt__(self, other):
  166. return self.name < other.name
  167. def __hash__(self):
  168. return id(self)
  169. def enumerate_stripped(self, start_at=0):
  170. """return an iterator on stripped lines, starting from a given index
  171. if specified, else 0
  172. """
  173. idx = start_at
  174. if start_at:
  175. lines = self._stripped_lines[start_at:]
  176. else:
  177. lines = self._stripped_lines
  178. for line in lines:
  179. #if line:
  180. yield idx, line
  181. idx += 1
  182. def find(self, stripped_line):
  183. """return positions of the given stripped line in this set"""
  184. return self._index.get(stripped_line, ())
  185. def _mk_index(self):
  186. """create the index for this set"""
  187. index = defaultdict(list)
  188. for line_no, line in enumerate(self._stripped_lines):
  189. if line:
  190. index[line].append(line_no)
  191. return index
# pylint message table for this checker: a single message, R0801
# (symbol: duplicate-code), whose arguments are the number of files
# involved and the formatted list of locations/lines.
MSGS = {'R0801': ('Similar lines in %s files\n%s',
                  'duplicate-code',
                  'Indicates that a set of similar lines has been detected \
among multiple file. This usually means that the code should \
be refactored to avoid this duplication.')}
  197. def report_similarities(sect, stats, old_stats):
  198. """make a layout with some stats about duplication"""
  199. lines = ['', 'now', 'previous', 'difference']
  200. lines += table_lines_from_stats(stats, old_stats,
  201. ('nb_duplicated_lines',
  202. 'percent_duplicated_lines'))
  203. sect.append(Table(children=lines, cols=4, rheaders=1, cheaders=1))
# wrapper to get a pylint checker from the similar class
class SimilarChecker(BaseChecker, Similar):
    """checks for similarities and duplicated code. This computation may be
    memory / CPU intensive, so you should disable it if you experiment some
    problems.
    """

    __implements__ = (IRawChecker,)
    # configuration section name
    name = 'similarities'
    # messages
    msgs = MSGS
    # configuration options
    # for available dict keys/values see the optik parser 'add_option' method
    options = (('min-similarity-lines',
                {'default' : 4, 'type' : "int", 'metavar' : '<int>',
                 'help' : 'Minimum lines number of a similarity.'}),
               ('ignore-comments',
                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                 'help': 'Ignore comments when computing similarities.'}
               ),
               ('ignore-docstrings',
                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
                 'help': 'Ignore docstrings when computing similarities.'}
               ),
               ('ignore-imports',
                {'default' : False, 'type' : 'yn', 'metavar' : '<y or n>',
                 'help': 'Ignore imports when computing similarities.'}
               ),
              )
    # reports
    reports = (('RP0801', 'Duplication', report_similarities),)

    def __init__(self, linter=None):
        BaseChecker.__init__(self, linter)
        # note: Similar is initialized with the option defaults; set_option()
        # below keeps the Similar attributes in sync with configuration
        Similar.__init__(self, min_lines=4,
                         ignore_comments=True, ignore_docstrings=True)
        self.stats = None

    def set_option(self, optname, value, action=None, optdict=None):
        """method called to set an option (registered in the options list)

        overridden to report options setting to Similar
        """
        BaseChecker.set_option(self, optname, value, action, optdict)
        # mirror the freshly-parsed config value onto the Similar attribute
        if optname == 'min-similarity-lines':
            self.min_lines = self.config.min_similarity_lines
        elif optname == 'ignore-comments':
            self.ignore_comments = self.config.ignore_comments
        elif optname == 'ignore-docstrings':
            self.ignore_docstrings = self.config.ignore_docstrings
        elif optname == 'ignore-imports':
            self.ignore_imports = self.config.ignore_imports

    def open(self):
        """init the checkers: reset linesets and statistics information"""
        self.linesets = []
        self.stats = self.linter.add_stats(nb_duplicated_lines=0,
                                           percent_duplicated_lines=0)

    def process_module(self, node):
        """process a module

        the module's content is accessible via the stream object

        stream must implement the readlines method
        """
        with node.stream() as stream:
            self.append_stream(self.linter.current_name,
                               stream,
                               node.file_encoding)

    def close(self):
        """compute and display similarities on closing (i.e. end of parsing)"""
        total = sum(len(lineset) for lineset in self.linesets)
        duplicated = 0
        stats = self.stats
        for num, couples in self._compute_sims():
            msg = []
            for lineset, idx in couples:
                msg.append("==%s:%s" % (lineset.name, idx))
            msg.sort()
            # pylint: disable=W0631
            # intentionally reuses lineset/idx from the last loop iteration:
            # all couples share the same duplicated chunk of `num` lines
            for line in lineset._real_lines[idx:idx+num]:
                msg.append(line.rstrip())
            self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
            # each copy beyond the first counts as duplicated lines
            duplicated += num * (len(couples) - 1)
        stats['nb_duplicated_lines'] = duplicated
        # `total and ...` guards against division by zero when no file
        # contributed any line
        stats['percent_duplicated_lines'] = total and duplicated * 100. / total
  284. def register(linter):
  285. """required method to auto register this checker """
  286. linter.register_checker(SimilarChecker(linter))
  287. def usage(status=0):
  288. """display command line usage information"""
  289. print("finds copy pasted blocks in a set of files")
  290. print()
  291. print('Usage: symilar [-d|--duplicates min_duplicated_lines] \
  292. [-i|--ignore-comments] [--ignore-docstrings] [--ignore-imports] file1...')
  293. sys.exit(status)
  294. def Run(argv=None):
  295. """standalone command line access point"""
  296. if argv is None:
  297. argv = sys.argv[1:]
  298. from getopt import getopt
  299. s_opts = 'hdi'
  300. l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports',
  301. 'ignore-docstrings')
  302. min_lines = 4
  303. ignore_comments = False
  304. ignore_docstrings = False
  305. ignore_imports = False
  306. opts, args = getopt(argv, s_opts, l_opts)
  307. for opt, val in opts:
  308. if opt in ('-d', '--duplicates'):
  309. min_lines = int(val)
  310. elif opt in ('-h', '--help'):
  311. usage()
  312. elif opt in ('-i', '--ignore-comments'):
  313. ignore_comments = True
  314. elif opt in ('--ignore-docstrings',):
  315. ignore_docstrings = True
  316. elif opt in ('--ignore-imports',):
  317. ignore_imports = True
  318. if not args:
  319. usage(1)
  320. sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports)
  321. for filename in args:
  322. with open(filename) as stream:
  323. sim.append_stream(filename, stream)
  324. sim.run()
  325. sys.exit(0)
if __name__ == '__main__':
    # script entry point: run the standalone duplicate finder
    Run()