Funktionierender Prototyp des Serious Games zur Vermittlung von Wissen zu Software-Engineering-Arbeitsmodellen.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

PyParse.py 18KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
  1. import re
  2. import string
  3. import sys
  4. # Reason last stmt is continued (or C_NONE if it's not).
  5. C_NONE, C_BACKSLASH, C_STRING, C_BRACKET = list(range(4))
  6. if 0: # for throwaway debugging output
  7. def dump(*stuff):
  8. sys.__stdout__.write(" ".join(map(str, stuff)) + "\n")
  9. # Find what looks like the start of a popular stmt.
  10. _synchre = re.compile(
  11. r"""
  12. ^
  13. [ \t]*
  14. (?: if
  15. | for
  16. | while
  17. | else
  18. | def
  19. | return
  20. | assert
  21. | break
  22. | class
  23. | continue
  24. | elif
  25. | try
  26. | except
  27. | raise
  28. | import
  29. )
  30. \b
  31. """,
  32. re.VERBOSE | re.MULTILINE,
  33. ).search
  34. # Match blank line or non-indenting comment line.
  35. _junkre = re.compile(
  36. r"""
  37. [ \t]*
  38. (?: \# \S .* )?
  39. \n
  40. """,
  41. re.VERBOSE,
  42. ).match
  43. # Match any flavor of string; the terminating quote is optional
  44. # so that we're robust in the face of incomplete program text.
  45. _match_stringre = re.compile(
  46. r"""
  47. \""" [^"\\]* (?:
  48. (?: \\. | "(?!"") )
  49. [^"\\]*
  50. )*
  51. (?: \""" )?
  52. | " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
  53. | ''' [^'\\]* (?:
  54. (?: \\. | '(?!'') )
  55. [^'\\]*
  56. )*
  57. (?: ''' )?
  58. | ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
  59. """,
  60. re.VERBOSE | re.DOTALL,
  61. ).match
  62. # Match a line that starts with something interesting;
  63. # used to find the first item of a bracket structure.
  64. _itemre = re.compile(
  65. r"""
  66. [ \t]*
  67. [^\s#\\] # if we match, m.end()-1 is the interesting char
  68. """,
  69. re.VERBOSE,
  70. ).match
  71. # Match start of stmts that should be followed by a dedent.
  72. _closere = re.compile(
  73. r"""
  74. \s*
  75. (?: return
  76. | break
  77. | continue
  78. | raise
  79. | pass
  80. )
  81. \b
  82. """,
  83. re.VERBOSE,
  84. ).match
  85. # Chew up non-special chars as quickly as possible. If match is
  86. # successful, m.end() less 1 is the index of the last boring char
  87. # matched. If match is unsuccessful, the string starts with an
  88. # interesting char.
  89. _chew_ordinaryre = re.compile(
  90. r"""
  91. [^[\](){}#'"\\]+
  92. """,
  93. re.VERBOSE,
  94. ).match
  95. # Build translation table to map uninteresting chars to "x", open
  96. # brackets to "(", and close brackets to ")".
  97. _tran = ["x"] * 256
  98. for ch in "({[":
  99. _tran[ord(ch)] = "("
  100. for ch in ")}]":
  101. _tran[ord(ch)] = ")"
  102. for ch in "\"'\\\n#":
  103. _tran[ord(ch)] = ch
  104. # We are called with unicode strings, and str.translate is one of the few
  105. # py2k functions which can't 'do the right thing' - so take care to ensure
  106. # _tran is full of unicode...
  107. _tran = "".join(_tran)
  108. del ch
  109. class Parser:
  110. def __init__(self, indentwidth, tabwidth):
  111. self.indentwidth = indentwidth
  112. self.tabwidth = tabwidth
  113. def set_str(self, str):
  114. assert len(str) == 0 or str[-1] == "\n", "Oops - have str %r" % (str,)
  115. self.str = str
  116. self.study_level = 0
  117. # Return index of a good place to begin parsing, as close to the
  118. # end of the string as possible. This will be the start of some
  119. # popular stmt like "if" or "def". Return None if none found:
  120. # the caller should pass more prior context then, if possible, or
  121. # if not (the entire program text up until the point of interest
  122. # has already been tried) pass 0 to set_lo.
  123. #
  124. # This will be reliable iff given a reliable is_char_in_string
  125. # function, meaning that when it says "no", it's absolutely
  126. # guaranteed that the char is not in a string.
  127. #
  128. # Ack, hack: in the shell window this kills us, because there's
  129. # no way to tell the differences between output, >>> etc and
  130. # user input. Indeed, IDLE's first output line makes the rest
  131. # look like it's in an unclosed paren!:
  132. # Python 1.5.2 (#0, Apr 13 1999, ...
  133. def find_good_parse_start(self, use_ps1, is_char_in_string=None):
  134. str, pos = self.str, None
  135. if use_ps1:
  136. # shell window
  137. ps1 = "\n" + sys.ps1
  138. i = str.rfind(ps1)
  139. if i >= 0:
  140. pos = i + len(ps1)
  141. # make it look like there's a newline instead
  142. # of ps1 at the start -- hacking here once avoids
  143. # repeated hackery later
  144. self.str = str[: pos - 1] + "\n" + str[pos:]
  145. return pos
  146. # File window -- real work.
  147. if not is_char_in_string:
  148. # no clue -- make the caller pass everything
  149. return None
  150. # Peek back from the end for a good place to start,
  151. # but don't try too often; pos will be left None, or
  152. # bumped to a legitimate synch point.
  153. limit = len(str)
  154. for tries in range(5):
  155. i = str.rfind(":\n", 0, limit)
  156. if i < 0:
  157. break
  158. i = str.rfind("\n", 0, i) + 1 # start of colon line
  159. m = _synchre(str, i, limit)
  160. if m and not is_char_in_string(m.start()):
  161. pos = m.start()
  162. break
  163. limit = i
  164. if pos is None:
  165. # Nothing looks like a block-opener, or stuff does
  166. # but is_char_in_string keeps returning true; most likely
  167. # we're in or near a giant string, the colorizer hasn't
  168. # caught up enough to be helpful, or there simply *aren't*
  169. # any interesting stmts. In any of these cases we're
  170. # going to have to parse the whole thing to be sure, so
  171. # give it one last try from the start, but stop wasting
  172. # time here regardless of the outcome.
  173. m = _synchre(str)
  174. if m and not is_char_in_string(m.start()):
  175. pos = m.start()
  176. return pos
  177. # Peeking back worked; look forward until _synchre no longer
  178. # matches.
  179. i = pos + 1
  180. while 1:
  181. m = _synchre(str, i)
  182. if m:
  183. s, i = m.span()
  184. if not is_char_in_string(s):
  185. pos = s
  186. else:
  187. break
  188. return pos
  189. # Throw away the start of the string. Intended to be called with
  190. # find_good_parse_start's result.
  191. def set_lo(self, lo):
  192. assert lo == 0 or self.str[lo - 1] == "\n"
  193. if lo > 0:
  194. self.str = self.str[lo:]
  195. # As quickly as humanly possible <wink>, find the line numbers (0-
  196. # based) of the non-continuation lines.
  197. # Creates self.{goodlines, continuation}.
  198. def _study1(self):
  199. if self.study_level >= 1:
  200. return
  201. self.study_level = 1
  202. # Map all uninteresting characters to "x", all open brackets
  203. # to "(", all close brackets to ")", then collapse runs of
  204. # uninteresting characters. This can cut the number of chars
  205. # by a factor of 10-40, and so greatly speed the following loop.
  206. str = self.str
  207. str = str.translate(_tran)
  208. str = str.replace("xxxxxxxx", "x")
  209. str = str.replace("xxxx", "x")
  210. str = str.replace("xx", "x")
  211. str = str.replace("xx", "x")
  212. str = str.replace("\nx", "\n")
  213. # note that replacing x\n with \n would be incorrect, because
  214. # x may be preceded by a backslash
  215. # March over the squashed version of the program, accumulating
  216. # the line numbers of non-continued stmts, and determining
  217. # whether & why the last stmt is a continuation.
  218. continuation = C_NONE
  219. level = lno = 0 # level is nesting level; lno is line number
  220. self.goodlines = goodlines = [0]
  221. push_good = goodlines.append
  222. i, n = 0, len(str)
  223. while i < n:
  224. ch = str[i]
  225. i = i + 1
  226. # cases are checked in decreasing order of frequency
  227. if ch == "x":
  228. continue
  229. if ch == "\n":
  230. lno = lno + 1
  231. if level == 0:
  232. push_good(lno)
  233. # else we're in an unclosed bracket structure
  234. continue
  235. if ch == "(":
  236. level = level + 1
  237. continue
  238. if ch == ")":
  239. if level:
  240. level = level - 1
  241. # else the program is invalid, but we can't complain
  242. continue
  243. if ch == '"' or ch == "'":
  244. # consume the string
  245. quote = ch
  246. if str[i - 1 : i + 2] == quote * 3:
  247. quote = quote * 3
  248. w = len(quote) - 1
  249. i = i + w
  250. while i < n:
  251. ch = str[i]
  252. i = i + 1
  253. if ch == "x":
  254. continue
  255. if str[i - 1 : i + w] == quote:
  256. i = i + w
  257. break
  258. if ch == "\n":
  259. lno = lno + 1
  260. if w == 0:
  261. # unterminated single-quoted string
  262. if level == 0:
  263. push_good(lno)
  264. break
  265. continue
  266. if ch == "\\":
  267. assert i < n
  268. if str[i] == "\n":
  269. lno = lno + 1
  270. i = i + 1
  271. continue
  272. # else comment char or paren inside string
  273. else:
  274. # didn't break out of the loop, so we're still
  275. # inside a string
  276. continuation = C_STRING
  277. continue # with outer loop
  278. if ch == "#":
  279. # consume the comment
  280. i = str.find("\n", i)
  281. assert i >= 0
  282. continue
  283. assert ch == "\\"
  284. assert i < n
  285. if str[i] == "\n":
  286. lno = lno + 1
  287. if i + 1 == n:
  288. continuation = C_BACKSLASH
  289. i = i + 1
  290. # The last stmt may be continued for all 3 reasons.
  291. # String continuation takes precedence over bracket
  292. # continuation, which beats backslash continuation.
  293. if continuation != C_STRING and level > 0:
  294. continuation = C_BRACKET
  295. self.continuation = continuation
  296. # Push the final line number as a sentinel value, regardless of
  297. # whether it's continued.
  298. assert (continuation == C_NONE) == (goodlines[-1] == lno)
  299. if goodlines[-1] != lno:
  300. push_good(lno)
  301. def get_continuation_type(self):
  302. self._study1()
  303. return self.continuation
  304. # study1 was sufficient to determine the continuation status,
  305. # but doing more requires looking at every character. study2
  306. # does this for the last interesting statement in the block.
  307. # Creates:
  308. # self.stmt_start, stmt_end
  309. # slice indices of last interesting stmt
  310. # self.lastch
  311. # last non-whitespace character before optional trailing
  312. # comment
  313. # self.lastopenbracketpos
  314. # if continuation is C_BRACKET, index of last open bracket
  315. def _study2(self):
  316. _ws = string.whitespace
  317. if self.study_level >= 2:
  318. return
  319. self._study1()
  320. self.study_level = 2
  321. # Set p and q to slice indices of last interesting stmt.
  322. str, goodlines = self.str, self.goodlines
  323. i = len(goodlines) - 1
  324. p = len(str) # index of newest line
  325. while i:
  326. assert p
  327. # p is the index of the stmt at line number goodlines[i].
  328. # Move p back to the stmt at line number goodlines[i-1].
  329. q = p
  330. for nothing in range(goodlines[i - 1], goodlines[i]):
  331. # tricky: sets p to 0 if no preceding newline
  332. p = str.rfind("\n", 0, p - 1) + 1
  333. # The stmt str[p:q] isn't a continuation, but may be blank
  334. # or a non-indenting comment line.
  335. if _junkre(str, p):
  336. i = i - 1
  337. else:
  338. break
  339. if i == 0:
  340. # nothing but junk!
  341. assert p == 0
  342. q = p
  343. self.stmt_start, self.stmt_end = p, q
  344. # Analyze this stmt, to find the last open bracket (if any)
  345. # and last interesting character (if any).
  346. lastch = ""
  347. stack = [] # stack of open bracket indices
  348. push_stack = stack.append
  349. while p < q:
  350. # suck up all except ()[]{}'"#\\
  351. m = _chew_ordinaryre(str, p, q)
  352. if m:
  353. # we skipped at least one boring char
  354. newp = m.end()
  355. # back up over totally boring whitespace
  356. i = newp - 1 # index of last boring char
  357. while i >= p and str[i] in " \t\n":
  358. i = i - 1
  359. if i >= p:
  360. lastch = str[i]
  361. p = newp
  362. if p >= q:
  363. break
  364. ch = str[p]
  365. if ch in "([{":
  366. push_stack(p)
  367. lastch = ch
  368. p = p + 1
  369. continue
  370. if ch in ")]}":
  371. if stack:
  372. del stack[-1]
  373. lastch = ch
  374. p = p + 1
  375. continue
  376. if ch == '"' or ch == "'":
  377. # consume string
  378. # Note that study1 did this with a Python loop, but
  379. # we use a regexp here; the reason is speed in both
  380. # cases; the string may be huge, but study1 pre-squashed
  381. # strings to a couple of characters per line. study1
  382. # also needed to keep track of newlines, and we don't
  383. # have to.
  384. lastch = ch
  385. p = _match_stringre(str, p, q).end()
  386. continue
  387. if ch == "#":
  388. # consume comment and trailing newline
  389. p = str.find("\n", p, q) + 1
  390. assert p > 0
  391. continue
  392. assert ch == "\\"
  393. p = p + 1 # beyond backslash
  394. assert p < q
  395. if str[p] != "\n":
  396. # the program is invalid, but can't complain
  397. lastch = ch + str[p]
  398. p = p + 1 # beyond escaped char
  399. # end while p < q:
  400. self.lastch = lastch
  401. if stack:
  402. self.lastopenbracketpos = stack[-1]
  403. # Assuming continuation is C_BRACKET, return the number
  404. # of spaces the next line should be indented.
  405. def compute_bracket_indent(self):
  406. self._study2()
  407. assert self.continuation == C_BRACKET
  408. j = self.lastopenbracketpos
  409. str = self.str
  410. n = len(str)
  411. origi = i = str.rfind("\n", 0, j) + 1
  412. j = j + 1 # one beyond open bracket
  413. # find first list item; set i to start of its line
  414. while j < n:
  415. m = _itemre(str, j)
  416. if m:
  417. j = m.end() - 1 # index of first interesting char
  418. extra = 0
  419. break
  420. else:
  421. # this line is junk; advance to next line
  422. i = j = str.find("\n", j) + 1
  423. else:
  424. # nothing interesting follows the bracket;
  425. # reproduce the bracket line's indentation + a level
  426. j = i = origi
  427. while str[j] in " \t":
  428. j = j + 1
  429. extra = self.indentwidth
  430. return len(str[i:j].expandtabs(self.tabwidth)) + extra
  431. # Return number of physical lines in last stmt (whether or not
  432. # it's an interesting stmt! this is intended to be called when
  433. # continuation is C_BACKSLASH).
  434. def get_num_lines_in_stmt(self):
  435. self._study1()
  436. goodlines = self.goodlines
  437. return goodlines[-1] - goodlines[-2]
  438. # Assuming continuation is C_BACKSLASH, return the number of spaces
  439. # the next line should be indented. Also assuming the new line is
  440. # the first one following the initial line of the stmt.
  441. def compute_backslash_indent(self):
  442. self._study2()
  443. assert self.continuation == C_BACKSLASH
  444. str = self.str
  445. i = self.stmt_start
  446. while str[i] in " \t":
  447. i = i + 1
  448. startpos = i
  449. # See whether the initial line starts an assignment stmt; i.e.,
  450. # look for an = operator
  451. endpos = str.find("\n", startpos) + 1
  452. found = level = 0
  453. while i < endpos:
  454. ch = str[i]
  455. if ch in "([{":
  456. level = level + 1
  457. i = i + 1
  458. elif ch in ")]}":
  459. if level:
  460. level = level - 1
  461. i = i + 1
  462. elif ch == '"' or ch == "'":
  463. i = _match_stringre(str, i, endpos).end()
  464. elif ch == "#":
  465. break
  466. elif (
  467. level == 0
  468. and ch == "="
  469. and (i == 0 or str[i - 1] not in "=<>!")
  470. and str[i + 1] != "="
  471. ):
  472. found = 1
  473. break
  474. else:
  475. i = i + 1
  476. if found:
  477. # found a legit =, but it may be the last interesting
  478. # thing on the line
  479. i = i + 1 # move beyond the =
  480. found = re.match(r"\s*\\", str[i:endpos]) is None
  481. if not found:
  482. # oh well ... settle for moving beyond the first chunk
  483. # of non-whitespace chars
  484. i = startpos
  485. while str[i] not in " \t\n":
  486. i = i + 1
  487. return len(str[self.stmt_start : i].expandtabs(self.tabwidth)) + 1
  488. # Return the leading whitespace on the initial line of the last
  489. # interesting stmt.
  490. def get_base_indent_string(self):
  491. self._study2()
  492. i, n = self.stmt_start, self.stmt_end
  493. j = i
  494. str = self.str
  495. while j < n and str[j] in " \t":
  496. j = j + 1
  497. return str[i:j]
  498. # Did the last interesting stmt open a block?
  499. def is_block_opener(self):
  500. self._study2()
  501. return self.lastch == ":"
  502. # Did the last interesting stmt close a block?
  503. def is_block_closer(self):
  504. self._study2()
  505. return _closere(self.str, self.stmt_start) is not None
  506. # index of last open bracket ({[, or None if none
  507. lastopenbracketpos = None
  508. def get_last_open_bracket_pos(self):
  509. self._study2()
  510. return self.lastopenbracketpos