Ticket #110: babel-javascript-extractor.patch
| File babel-javascript-extractor.patch, 14.0 KB (added by aronacher, 4 years ago) |
|---|
-
babel/messages/extract.py
from tokenize import generate_tokens, COMMENT, NAME, OP, STRING

from babel.util import parse_encoding, pathmatch, relpath
from textwrap import dedent

__all__ = ['extract', 'extract_from_dir', 'extract_from_file']
__docformat__ = 'restructuredtext en'

# ... (unchanged parts of babel/messages/extract.py are omitted by the
# patch view; the lines below are the tail of the preceding function and
# are shown as context only) ...
#
#                 funcname = None
#             elif tok == NAME and value in keywords:
#                 funcname = value


def extract_javascript(fileobj, keywords, comment_tags, options):
    """Extract messages from JavaScript source code.

    The source is tokenized with `babel.messages.jslexer.tokenize` and
    scanned for calls to one of the `keywords`.  Only string literals that
    are direct arguments of such a call are collected; any argument that is
    not a plain string literal is recorded as ``None``.

    :param fileobj: the seekable, file-like object the messages should be
                    extracted from
    :param keywords: a list of keywords (i.e. function names) that should be
                     recognized as translation functions
    :param comment_tags: a list of translator tags to search for and include
                         in the results
    :param options: a dictionary of additional options (optional); the
                    ``encoding`` key (default ``'utf-8'``) is used to decode
                    the content of `fileobj`
    :return: an iterator over ``(lineno, funcname, message, comments)`` tuples
    :rtype: ``iterator``
    """
    from babel.messages.jslexer import tokenize, unquote_string
    funcname = message_lineno = None
    messages = []
    last_argument = None
    translator_comments = []
    translator_comment_tag = None
    encoding = options.get('encoding', 'utf-8')
    last_token = None
    # -1: outside of a translation call, 0: directly inside its argument
    # list, > 0: inside a nested pair of parentheses within that list.
    call_stack = -1

    for token in tokenize(fileobj.read().decode(encoding)):
        if token.type == 'operator' and token.value == '(':
            if funcname:
                # Only the parenthesis that opens the translation call
                # determines the reported line; nested calls on later lines
                # must not overwrite it.
                if call_stack == -1:
                    message_lineno = token.lineno
                call_stack += 1

        elif token.type == 'linecomment':
            value = token.value[2:].strip()
            # A line comment directly below an already collected tagged
            # comment continues that translator comment.
            if translator_comment_tag is not None and \
               translator_comments[-1][0] == token.lineno - 1:
                if value.startswith(translator_comment_tag):
                    value = value[len(translator_comment_tag):].strip()
                translator_comments.append((token.lineno, value))
                # NOTE: `continue` deliberately skips the `last_token`
                # update at the bottom of the loop.
                continue

            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    translator_comment_tag = comment_tag
                    value = value[len(comment_tag):].strip()
                    translator_comments.append((token.lineno, value))
                    break

        elif token.type == 'multilinecomment':
            value = token.value[2:-2].strip()
            for comment_tag in comment_tags:
                if value.startswith(comment_tag):
                    lines = value[len(comment_tag):].splitlines()
                    # A comment consisting of nothing but the tag yields no
                    # lines at all; there is nothing to record then.
                    if lines:
                        value = ''.join([lines[0].strip() + '\n',
                                         dedent('\n'.join(lines[1:]))])
                        translator_comments.append((token.lineno, value))
                    break

        elif funcname and call_stack == 0:
            if token.type == 'operator' and token.value == ')':
                # End of the translation call: flush the pending argument
                # and normalize the collected messages.
                if last_argument is not None:
                    messages.append(last_argument)
                if len(messages) > 1:
                    messages = tuple(messages)
                elif messages:
                    messages = messages[0]
                else:
                    messages = None

                if messages is not None:
                    yield (message_lineno, funcname, messages,
                           [comment[1] for comment in translator_comments])

                funcname = message_lineno = last_argument = None
                translator_comments = []
                messages = []
                translator_comment_tag = None
                call_stack = -1

            elif token.type == 'string':
                last_argument = unquote_string(token.value)

            elif token.type == 'operator' and token.value == ',':
                if last_argument is not None:
                    messages.append(last_argument)
                    last_argument = None
                else:
                    # The argument was not a plain string literal.
                    messages.append(None)

        elif call_stack > 0 and token.type == 'operator' \
             and token.value == ')':
            call_stack -= 1

        elif funcname and call_stack == -1:
            # A keyword that is not followed by `(` is not a call.
            funcname = None

        elif call_stack == -1 and token.type == 'name' and \
             token.value in keywords and \
             (last_token is None or last_token.type != 'name' or
              last_token.value != 'function'):
            # `function gettext(...)` is a definition, not a call.
            funcname = token.value

        last_token = token
babel/messages/tests/extract.py
# ... (patch context: tail of the preceding test method, whose header is
# outside of the patch view) ...
#
#         self.assertEqual(u'Bonjour à tous', messages[0][2])
#         self.assertEqual(messages[0][2], messages[1][2])


class ExtractJavaScriptTestCase(unittest.TestCase):
    """Tests for the JavaScript message extractor."""

    def test_simple_extract(self):
        """Plain calls of the default keywords are extracted."""
        source = StringIO("""\
msg1 = _('simple')
msg2 = gettext('simple')
msg3 = ngettext('s', 'p', 42)
""")
        extracted = extract.extract('javascript', source,
                                    extract.DEFAULT_KEYWORDS, [], {})
        messages = list(extracted)

        expected = [
            (1, 'simple', []),
            (2, 'simple', []),
            (3, ('s', 'p'), []),
        ]
        self.assertEqual(expected, messages)

    def test_various_calls(self):
        """Calls with non-literal arguments are skipped."""
        source = StringIO("""\
msg1 = _(i18n_arg.replace(/"/, '"'))
msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2)
msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2)
msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2)
msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1))
msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1))
msg7 = _(hello.there)
msg8 = gettext('Rabbit')
msg9 = dgettext('wiki', model.addPage())
msg10 = dngettext(domain, 'Page', 'Pages', 3)
""")
        extracted = extract.extract('javascript', source,
                                    extract.DEFAULT_KEYWORDS, [], {})
        messages = list(extracted)

        expected = [
            (5, (u'bunny', u'bunnies'), []),
            (8, u'Rabbit', []),
            (10, (u'Page', u'Pages'), []),
        ]
        self.assertEqual(expected, messages)

    def test_message_with_line_comment(self):
        """A tagged ``//`` comment is attached to the following message."""
        source = StringIO("""\
// NOTE: hello
msg = _('Bonjour à tous')
""")
        extracted = extract.extract_javascript(source, ('_',), ['NOTE:'], {})
        messages = list(extracted)

        first = messages[0]
        self.assertEqual(u'Bonjour à tous', first[2])
        self.assertEqual([u'hello'], first[3])

    def test_message_with_multiline_comment(self):
        """A tagged ``/* */`` comment is dedented and attached as one
        string.
        """
        source = StringIO("""\
/* NOTE: hello
   and bonjour
    and servus */
msg = _('Bonjour à tous')
""")
        extracted = extract.extract_javascript(source, ('_',), ['NOTE:'], {})
        messages = list(extracted)

        first = messages[0]
        self.assertEqual(u'Bonjour à tous', first[2])
        self.assertEqual([u'hello\nand bonjour\n and servus'], first[3])

    def test_ignore_function_definitions(self):
        """``function gettext(...)`` is a definition, not a translation
        call.
        """
        source = StringIO("""\
function gettext(value) {
    return translations[language][value] || value;
}""")

        extracted = extract.extract_javascript(source, ('gettext',), [], {})
        messages = list(extracted)
        self.assertEqual(messages, [])


# ... (patch context: the lines below belong to definitions that are only
# partially visible in the patch view and are shown as context only) ...
#
# class ExtractTestCase(unittest.TestCase):
#
#     def test_invalid_filter(self):
#         ...
#
#     suite = unittest.TestSuite()
#     suite.addTest(doctest.DocTestSuite(extract))
#     suite.addTest(unittest.makeSuite(ExtractPythonTestCase))
#     suite.addTest(unittest.makeSuite(ExtractJavaScriptTestCase))
#     suite.addTest(unittest.makeSuite(ExtractTestCase))
#     return suite
babel/messages/jslexer.py
# -*- coding: utf-8 -*-
#
# Copyright (C) 2008 Edgewall Software
# All rights reserved.
#
# This software is licensed as described in the file COPYING, which
# you should have received as part of this distribution. The terms
# are also available at http://babel.edgewall.org/wiki/License.
#
# This software consists of voluntary contributions made by many
# individuals. For the exact contribution history, see the revision
# history and logs, available at http://babel.edgewall.org/log/.

"""A simple JavaScript 1.5 lexer which is used for the JavaScript
extractor.
"""

import re
from operator import itemgetter

try:
    unichr
except NameError:
    # Python 3 has no `unichr`; `chr` covers the full Unicode range there.
    unichr = chr


# `/` and `/=` are intentionally absent: whether a slash starts a division
# or a regular expression literal depends on the previous token and is
# decided in `tokenize`.
operators = [
    '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=',
    '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=',
    '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')',
    '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.',
    '?', ':', '===', '!=='
]
# Longest operators first so that the alternation in the operator rule
# prefers e.g. `>>>=` over `>`.
operators.sort(key=len, reverse=True)

escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'}

# Rules are tried in order; a rule whose type is `None` matches text that
# produces no token (whitespace).
rules = [
    (None, re.compile(r'\s+', re.UNICODE)),
    ('linecomment', re.compile(r'//.*')),
    ('multilinecomment', re.compile(r'/\*.*?\*/', re.UNICODE | re.DOTALL)),
    ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)', re.UNICODE)),
    ('number', re.compile(r'''(?x)(
        (?:0|[1-9]\d*)
        (\.\d+)?
        ([eE][-+]?\d+)?
    )''')),
    ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))),
    ('string', re.compile(r'''(?xs)(
        '(?:[^'\\]*(?:\\.[^'\\]*)*)'  |
        "(?:[^"\\]*(?:\\.[^"\\]*)*)"
    )'''))
]

# Token types that must not influence the division/regexp decision.
_comment_types = ('linecomment', 'multilinecomment')

division_re = re.compile(r'/=?')
# A backslash escapes the following character, so `\/` does not terminate
# the regular expression literal.
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
line_re = re.compile(r'(\r\n|\n|\r)')
line_join_re = re.compile(r'\\' + line_re.pattern)
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')


class TokenError(ValueError):
    """Raised if the tokenizer stumbled upon invalid tokens."""


class Token(tuple):
    """Represents a token as returned by `tokenize`.

    A token is a ``(type, value, lineno)`` tuple whose items are also
    available as the attributes `type`, `value` and `lineno`.
    """
    __slots__ = ()

    def __new__(cls, type, value, lineno):
        return tuple.__new__(cls, (type, value, lineno))

    type = property(itemgetter(0))
    value = property(itemgetter(1))
    lineno = property(itemgetter(2))


def indicates_division(token):
    """A helper function that helps the tokenizer to decide if the current
    token may be followed by a division operator.

    :param token: the `Token` that was processed last
    :return: `True` if a ``/`` following the token is a division operator,
             `False` if it starts a regular expression literal
    """
    if token.type == 'operator':
        return token.value in (')', ']', '}', '++', '--')
    return token.type in ('name', 'number', 'string', 'regexp')


def unquote_string(string):
    """Unquote a string with JavaScript rules.  The string has to start with
    string delimiters (``'`` or ``"``.)

    :param string: the string literal including its delimiters
    :return: a string
    """
    assert string and string[0] == string[-1] and string[0] in '"\'', \
        'string provided is not properly delimited'
    # A backslash in front of a line break joins the lines; only the
    # backslash is removed, the line break itself is kept.
    string = line_join_re.sub('\\1', string[1:-1])
    result = []
    add = result.append
    pos = 0

    while 1:
        # scan for the next escape
        escape_pos = string.find('\\', pos)
        if escape_pos < 0:
            break
        add(string[pos:escape_pos])

        # a dangling backslash at the very end escapes nothing; keep it
        # verbatim (it is emitted by the tail handling below) instead of
        # failing with an IndexError.
        if escape_pos + 1 >= len(string):
            pos = escape_pos
            break

        # check which character is escaped
        next_char = string[escape_pos + 1]
        if next_char in escapes:
            add(escapes[next_char])

        # unicode escapes.  try to consume up to four characters of
        # hexadecimal characters and try to interpret them as unicode
        # character point.  If there is no such character point, put
        # all the consumed characters into the string.
        elif next_char in 'uU':
            escaped = uni_escape_re.match(string, escape_pos + 2)
            if escaped is not None:
                escaped_value = escaped.group()
                if len(escaped_value) == 4:
                    try:
                        add(unichr(int(escaped_value, 16)))
                    except ValueError:
                        pass
                    else:
                        pos = escape_pos + 6
                        continue
                add(next_char + escaped_value)
                pos = escaped.end()
                continue
            else:
                add(next_char)

        # bogus escape.  Just remove the backslash.
        else:
            add(next_char)
        pos = escape_pos + 2

    if pos < len(string):
        add(string[pos:])

    return u''.join(result)


def tokenize(source):
    """Tokenize a JavaScript source.

    :param source: the JavaScript source as (unicode) string
    :return: generator of `Token` objects
    :raise TokenError: if the source contains text no rule can match
    """
    may_divide = False
    pos = 0
    lineno = 1
    end = len(source)

    while pos < end:
        # handle regular rules first
        for token_type, rule in rules:
            match = rule.match(source, pos)
            if match is not None:
                break
        # if we don't have a match we don't give up yet, but check for
        # division operators or regular expression literals, based on
        # the status of `may_divide` which is determined by the last
        # processed non-whitespace, non-comment token using
        # `indicates_division`.
        else:
            if may_divide:
                match = division_re.match(source, pos)
                token_type = 'operator'
            else:
                match = regex_re.match(source, pos)
                token_type = 'regexp'
            if match is None:
                raise TokenError('invalid syntax around line %d' % lineno)

        token_value = match.group()
        if token_type is not None:
            token = Token(token_type, token_value, lineno)
            # comments are transparent for the division/regexp decision,
            # e.g. the second slash in `a /* half */ / 2` is a division.
            if token_type not in _comment_types:
                may_divide = indicates_division(token)
            yield token
        lineno += len(line_re.findall(token_value))
        pos = match.end()
setup.py
75 75 [babel.extractors] 76 76 ignore = babel.messages.extract:extract_nothing 77 77 python = babel.messages.extract:extract_python 78 javascript = babel.messages.extract:extract_javascript 78 79 """, 79 80 80 81 cmdclass = {'build_doc': build_doc, 'test_doc': test_doc}
