Funktionierender Prototyp des Serious Games zur Vermittlung von Wissen zu Software-Engineering-Arbeitsmodellen.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

jslex.py 7.9KB

1 year ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249
  1. """JsLex: a lexer for JavaScript"""
  2. # Originally from https://bitbucket.org/ned/jslex
  3. import re
  4. class Tok:
  5. """
  6. A specification for a token class.
  7. """
  8. num = 0
  9. def __init__(self, name, regex, next=None):
  10. self.id = Tok.num
  11. Tok.num += 1
  12. self.name = name
  13. self.regex = regex
  14. self.next = next
  15. def literals(choices, prefix="", suffix=""):
  16. """
  17. Create a regex from a space-separated list of literal `choices`.
  18. If provided, `prefix` and `suffix` will be attached to each choice
  19. individually.
  20. """
  21. return "|".join(prefix + re.escape(c) + suffix for c in choices.split())
  22. class Lexer:
  23. """
  24. A generic multi-state regex-based lexer.
  25. """
  26. def __init__(self, states, first):
  27. self.regexes = {}
  28. self.toks = {}
  29. for state, rules in states.items():
  30. parts = []
  31. for tok in rules:
  32. groupid = "t%d" % tok.id
  33. self.toks[groupid] = tok
  34. parts.append("(?P<%s>%s)" % (groupid, tok.regex))
  35. self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)
  36. self.state = first
  37. def lex(self, text):
  38. """
  39. Lexically analyze `text`.
  40. Yield pairs (`name`, `tokentext`).
  41. """
  42. end = len(text)
  43. state = self.state
  44. regexes = self.regexes
  45. toks = self.toks
  46. start = 0
  47. while start < end:
  48. for match in regexes[state].finditer(text, start):
  49. name = match.lastgroup
  50. tok = toks[name]
  51. toktext = match[name]
  52. start += len(toktext)
  53. yield (tok.name, toktext)
  54. if tok.next:
  55. state = tok.next
  56. break
  57. self.state = state
  58. class JsLexer(Lexer):
  59. """
  60. A JavaScript lexer
  61. >>> lexer = JsLexer()
  62. >>> list(lexer.lex("a = 1"))
  63. [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
  64. This doesn't properly handle non-ASCII characters in the JavaScript source.
  65. """
  66. # Because these tokens are matched as alternatives in a regex, longer
  67. # possibilities must appear in the list before shorter ones, for example,
  68. # '>>' before '>'.
  69. #
  70. # Note that we don't have to detect malformed JavaScript, only properly
  71. # lex correct JavaScript, so much of this is simplified.
  72. # Details of JavaScript lexical structure are taken from
  73. # https://www.ecma-international.org/publications-and-standards/standards/ecma-262/
  74. # A useful explanation of automatic semicolon insertion is at
  75. # http://inimino.org/~inimino/blog/javascript_semicolons
  76. both_before = [
  77. Tok("comment", r"/\*(.|\n)*?\*/"),
  78. Tok("linecomment", r"//.*?$"),
  79. Tok("ws", r"\s+"),
  80. Tok(
  81. "keyword",
  82. literals(
  83. """
  84. break case catch class const continue debugger
  85. default delete do else enum export extends
  86. finally for function if import in instanceof
  87. new return super switch this throw try typeof
  88. var void while with
  89. """,
  90. suffix=r"\b",
  91. ),
  92. next="reg",
  93. ),
  94. Tok("reserved", literals("null true false", suffix=r"\b"), next="div"),
  95. Tok(
  96. "id",
  97. r"""
  98. ([a-zA-Z_$ ]|\\u[0-9a-fA-Z]{4}) # first char
  99. ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})* # rest chars
  100. """,
  101. next="div",
  102. ),
  103. Tok("hnum", r"0[xX][0-9a-fA-F]+", next="div"),
  104. Tok("onum", r"0[0-7]+"),
  105. Tok(
  106. "dnum",
  107. r"""
  108. ( (0|[1-9][0-9]*) # DecimalIntegerLiteral
  109. \. # dot
  110. [0-9]* # DecimalDigits-opt
  111. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  112. |
  113. \. # dot
  114. [0-9]+ # DecimalDigits
  115. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  116. |
  117. (0|[1-9][0-9]*) # DecimalIntegerLiteral
  118. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  119. )
  120. """,
  121. next="div",
  122. ),
  123. Tok(
  124. "punct",
  125. literals(
  126. """
  127. >>>= === !== >>> <<= >>= <= >= == != << >> &&
  128. || += -= *= %= &= |= ^=
  129. """
  130. ),
  131. next="reg",
  132. ),
  133. Tok("punct", literals("++ -- ) ]"), next="div"),
  134. Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next="reg"),
  135. Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next="div"),
  136. Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next="div"),
  137. ]
  138. both_after = [
  139. Tok("other", r"."),
  140. ]
  141. states = {
  142. # slash will mean division
  143. "div": both_before
  144. + [
  145. Tok("punct", literals("/= /"), next="reg"),
  146. ]
  147. + both_after,
  148. # slash will mean regex
  149. "reg": both_before
  150. + [
  151. Tok(
  152. "regex",
  153. r"""
  154. / # opening slash
  155. # First character is..
  156. ( [^*\\/[] # anything but * \ / or [
  157. | \\. # or an escape sequence
  158. | \[ # or a class, which has
  159. ( [^\]\\] # anything but \ or ]
  160. | \\. # or an escape sequence
  161. )* # many times
  162. \]
  163. )
  164. # Following characters are same, except for excluding a star
  165. ( [^\\/[] # anything but \ / or [
  166. | \\. # or an escape sequence
  167. | \[ # or a class, which has
  168. ( [^\]\\] # anything but \ or ]
  169. | \\. # or an escape sequence
  170. )* # many times
  171. \]
  172. )* # many times
  173. / # closing slash
  174. [a-zA-Z0-9]* # trailing flags
  175. """,
  176. next="div",
  177. ),
  178. ]
  179. + both_after,
  180. }
  181. def __init__(self):
  182. super().__init__(self.states, "reg")
  183. def prepare_js_for_gettext(js):
  184. """
  185. Convert the JavaScript source `js` into something resembling C for
  186. xgettext.
  187. What actually happens is that all the regex literals are replaced with
  188. "REGEX".
  189. """
  190. def escape_quotes(m):
  191. """Used in a regex to properly escape double quotes."""
  192. s = m[0]
  193. if s == '"':
  194. return r"\""
  195. else:
  196. return s
  197. lexer = JsLexer()
  198. c = []
  199. for name, tok in lexer.lex(js):
  200. if name == "regex":
  201. # C doesn't grok regexes, and they aren't needed for gettext,
  202. # so just output a string instead.
  203. tok = '"REGEX"'
  204. elif name == "string":
  205. # C doesn't have single-quoted strings, so make all strings
  206. # double-quoted.
  207. if tok.startswith("'"):
  208. guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
  209. tok = '"' + guts + '"'
  210. elif name == "id":
  211. # C can't deal with Unicode escapes in identifiers. We don't
  212. # need them for gettext anyway, so replace them with something
  213. # innocuous
  214. tok = tok.replace("\\", "U")
  215. c.append(tok)
  216. return "".join(c)