
jslex.py

  1. """JsLex: a lexer for Javascript"""
  2. # Originally from https://bitbucket.org/ned/jslex
  3. import re
  4. class Tok:
  5. """
  6. A specification for a token class.
  7. """
  8. num = 0
  9. def __init__(self, name, regex, next=None):
  10. self.id = Tok.num
  11. Tok.num += 1
  12. self.name = name
  13. self.regex = regex
  14. self.next = next
  15. def literals(choices, prefix="", suffix=""):
  16. """
  17. Create a regex from a space-separated list of literal `choices`.
  18. If provided, `prefix` and `suffix` will be attached to each choice
  19. individually.
  20. """
  21. return "|".join(prefix + re.escape(c) + suffix for c in choices.split())
  22. class Lexer:
  23. """
  24. A generic multi-state regex-based lexer.
  25. """
  26. def __init__(self, states, first):
  27. self.regexes = {}
  28. self.toks = {}
  29. for state, rules in states.items():
  30. parts = []
  31. for tok in rules:
  32. groupid = "t%d" % tok.id
  33. self.toks[groupid] = tok
  34. parts.append("(?P<%s>%s)" % (groupid, tok.regex))
  35. self.regexes[state] = re.compile("|".join(parts), re.MULTILINE | re.VERBOSE)
  36. self.state = first
  37. def lex(self, text):
  38. """
  39. Lexically analyze `text`.
  40. Yield pairs (`name`, `tokentext`).
  41. """
  42. end = len(text)
  43. state = self.state
  44. regexes = self.regexes
  45. toks = self.toks
  46. start = 0
  47. while start < end:
  48. for match in regexes[state].finditer(text, start):
  49. name = match.lastgroup
  50. tok = toks[name]
  51. toktext = match.group(name)
  52. start += len(toktext)
  53. yield (tok.name, toktext)
  54. if tok.next:
  55. state = tok.next
  56. break
  57. self.state = state
  58. class JsLexer(Lexer):
  59. """
  60. A Javascript lexer
  61. >>> lexer = JsLexer()
  62. >>> list(lexer.lex("a = 1"))
  63. [('id', 'a'), ('ws', ' '), ('punct', '='), ('ws', ' '), ('dnum', '1')]
  64. This doesn't properly handle non-ASCII characters in the Javascript source.
  65. """
  66. # Because these tokens are matched as alternatives in a regex, longer
  67. # possibilities must appear in the list before shorter ones, for example,
  68. # '>>' before '>'.
  69. #
  70. # Note that we don't have to detect malformed Javascript, only properly
  71. # lex correct Javascript, so much of this is simplified.
  72. # Details of Javascript lexical structure are taken from
  73. # http://www.ecma-international.org/publications/files/ECMA-ST/ECMA-262.pdf
  74. # A useful explanation of automatic semicolon insertion is at
  75. # http://inimino.org/~inimino/blog/javascript_semicolons
  76. both_before = [
  77. Tok("comment", r"/\*(.|\n)*?\*/"),
  78. Tok("linecomment", r"//.*?$"),
  79. Tok("ws", r"\s+"),
  80. Tok("keyword", literals("""
  81. break case catch class const continue debugger
  82. default delete do else enum export extends
  83. finally for function if import in instanceof
  84. new return super switch this throw try typeof
  85. var void while with
  86. """, suffix=r"\b"), next='reg'),
  87. Tok("reserved", literals("null true false", suffix=r"\b"), next='div'),
  88. Tok("id", r"""
  89. ([a-zA-Z_$ ]|\\u[0-9a-fA-Z]{4}) # first char
  90. ([a-zA-Z_$0-9]|\\u[0-9a-fA-F]{4})* # rest chars
  91. """, next='div'),
  92. Tok("hnum", r"0[xX][0-9a-fA-F]+", next='div'),
  93. Tok("onum", r"0[0-7]+"),
  94. Tok("dnum", r"""
  95. ( (0|[1-9][0-9]*) # DecimalIntegerLiteral
  96. \. # dot
  97. [0-9]* # DecimalDigits-opt
  98. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  99. |
  100. \. # dot
  101. [0-9]+ # DecimalDigits
  102. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  103. |
  104. (0|[1-9][0-9]*) # DecimalIntegerLiteral
  105. ([eE][-+]?[0-9]+)? # ExponentPart-opt
  106. )
  107. """, next='div'),
  108. Tok("punct", literals("""
  109. >>>= === !== >>> <<= >>= <= >= == != << >> &&
  110. || += -= *= %= &= |= ^=
  111. """), next="reg"),
  112. Tok("punct", literals("++ -- ) ]"), next='div'),
  113. Tok("punct", literals("{ } ( [ . ; , < > + - * % & | ^ ! ~ ? : ="), next='reg'),
  114. Tok("string", r'"([^"\\]|(\\(.|\n)))*?"', next='div'),
  115. Tok("string", r"'([^'\\]|(\\(.|\n)))*?'", next='div'),
  116. ]
  117. both_after = [
  118. Tok("other", r"."),
  119. ]
  120. states = {
  121. # slash will mean division
  122. 'div': both_before + [
  123. Tok("punct", literals("/= /"), next='reg'),
  124. ] + both_after,
  125. # slash will mean regex
  126. 'reg': both_before + [
  127. Tok("regex",
  128. r"""
  129. / # opening slash
  130. # First character is..
  131. ( [^*\\/[] # anything but * \ / or [
  132. | \\. # or an escape sequence
  133. | \[ # or a class, which has
  134. ( [^\]\\] # anything but \ or ]
  135. | \\. # or an escape sequence
  136. )* # many times
  137. \]
  138. )
  139. # Following characters are same, except for excluding a star
  140. ( [^\\/[] # anything but \ / or [
  141. | \\. # or an escape sequence
  142. | \[ # or a class, which has
  143. ( [^\]\\] # anything but \ or ]
  144. | \\. # or an escape sequence
  145. )* # many times
  146. \]
  147. )* # many times
  148. / # closing slash
  149. [a-zA-Z0-9]* # trailing flags
  150. """, next='div'),
  151. ] + both_after,
  152. }
  153. def __init__(self):
  154. super().__init__(self.states, 'reg')
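

# Illustration (an addition, not part of the upstream module): the 'div' and
# 'reg' states above decide what a leading "/" means. A rough sketch of the
# expected behaviour, written doctest-style:
#
#     >>> list(JsLexer().lex("a / b"))[2]      # after an id: division
#     ('punct', '/')
#     >>> list(JsLexer().lex("x = /a/g"))[-1]  # after "=": a regex literal
#     ('regex', '/a/g')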

def prepare_js_for_gettext(js):
    """
    Convert the Javascript source `js` into something resembling C for
    xgettext.

    What actually happens is that all the regex literals are replaced with
    "REGEX".
    """
    def escape_quotes(m):
        """Used in a regex to properly escape double quotes."""
        s = m.group(0)
        if s == '"':
            return r'\"'
        else:
            return s

    lexer = JsLexer()
    c = []
    for name, tok in lexer.lex(js):
        if name == 'regex':
            # C doesn't grok regexes, and they aren't needed for gettext,
            # so just output a string instead.
            tok = '"REGEX"'
        elif name == 'string':
            # C doesn't have single-quoted strings, so make all strings
            # double-quoted.
            if tok.startswith("'"):
                guts = re.sub(r"\\.|.", escape_quotes, tok[1:-1])
                tok = '"' + guts + '"'
        elif name == 'id':
            # C can't deal with Unicode escapes in identifiers. We don't
            # need them for gettext anyway, so replace them with something
            # innocuous
            tok = tok.replace("\\", "U")
        c.append(tok)
    return ''.join(c)
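

# Example usage (an illustrative addition, not in the upstream file): running
# the module directly shows how regex literals and single-quoted strings are
# rewritten into the C-like form that xgettext can parse. The `sample` snippet
# is arbitrary.
if __name__ == "__main__":
    sample = "var re = /ab+c/g; gettext('hello');"
    print(prepare_js_for_gettext(sample))
    # Expected output:
    # var re = "REGEX"; gettext("hello");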