Ticket #110: genshi-javascript-extractor-improved.patch
| File genshi-javascript-extractor-improved.patch, 15.3 KB (added by aronacher, 4 years ago) |
|---|
-
babel/messages/extract.py
30 30 from tokenize import generate_tokens, COMMENT, NAME, OP, STRING 31 31 32 32 from babel.util import parse_encoding, pathmatch, relpath 33 from textwrap import dedent 33 34 34 35 __all__ = ['extract', 'extract_from_dir', 'extract_from_file'] 35 36 __docformat__ = 'restructuredtext en' … … 404 405 funcname = None 405 406 elif tok == NAME and value in keywords: 406 407 funcname = value 408 409 def extract_javascript(fileobj, keywords, comment_tags, options): 410 """Extract messages from JavaScript source code. 411 412 :param fileobj: the seekable, file-like object the messages should be 413 extracted from 414 :param keywords: a list of keywords (i.e. function names) that should be 415 recognized as translation functions 416 :param comment_tags: a list of translator tags to search for and include 417 in the results 418 :param options: a dictionary of additional options (optional) 419 :return: an iterator over ``(lineno, funcname, message, comments)`` tuples 420 :rtype: ``iterator`` 421 """ 422 from babel.messages.jslexer import tokenize, unquote_string 423 funcname = message_lineno = None 424 messages = [] 425 last_argument = None 426 translator_comments = [] 427 translator_comment_tag = None 428 encoding = options.get('encoding', 'utf-8') 429 last_token = None 430 call_stack = -1 431 432 for token in tokenize(fileobj.read().decode(encoding)): 433 if token.type == 'operator' and token.value == '(': 434 if funcname: 435 message_lineno = token.lineno 436 call_stack += 1 437 438 elif call_stack == -1 and token.type == 'linecomment': 439 value = token.value[2:].strip() 440 if translator_comment_tag is not None and \ 441 translator_comments[-1][0] == token.lineno - 1: 442 if value.startswith(translator_comment_tag): 443 value = value[len(translator_comment_tag):].strip() 444 translator_comments.append((token.lineno, value)) 445 continue 446 447 for comment_tag in comment_tags: 448 if value.startswith(comment_tag): 449 translator_comment_tag = comment_tag 450 value = value[len(comment_tag):].strip() 451 translator_comments.append((token.lineno, value)) 452 break 453 454 elif token.type == 'multilinecomment': 455 # only one multi-line comment may preceed a translation 456 translator_comments = [] 457 value = token.value[2:-2].strip() 458 for comment_tag in comment_tags: 459 if value.startswith(comment_tag): 460 lines = value[len(comment_tag):].splitlines() 461 if lines: 462 lines[0] = lines[0].strip() 463 lines[1:] = dedent('\n'.join(lines[1:])).splitlines() 464 for offset, line in enumerate(lines): 465 translator_comments.append((token.lineno + offset, 466 line)) 467 break 468 469 elif funcname and call_stack == 0: 470 if token.type == 'operator' and token.value == ')': 471 if last_argument is not None: 472 messages.append(last_argument) 473 if len(messages) > 1: 474 messages = tuple(messages) 475 elif messages: 476 messages = messages[0] 477 else: 478 messages = None 479 480 # Comments don't apply unless they immediately preceed the 481 # message 482 if translator_comments and \ 483 translator_comments[-1][0] < message_lineno - 1: 484 translator_comments = [] 485 486 if messages is not None: 487 yield (message_lineno, funcname, messages, 488 [comment[1] for comment in translator_comments]) 489 490 funcname = message_lineno = last_argument = None 491 translator_comments = [] 492 messages = [] 493 translator_comment_tag = None 494 call_stack = -1 495 496 elif token.type == 'string': 497 last_argument = unquote_string(token.value) 498 499 elif token.type == 'operator' and token.value == ',': 500 if last_argument is not None: 501 messages.append(last_argument) 502 last_argument = None 503 else: 504 messages.append(None) 505 506 elif call_stack > 0 and token.type == 'operator' \ 507 and token.value == ')': 508 call_stack -= 1 509 510 elif funcname and call_stack == -1: 511 funcname = None 512 513 elif call_stack == -1 and token.type == 'name' and \ 514 token.value in keywords and \ 515 (last_token is None or last_token.type != 'name' or 516 last_token.value != 'function'): 517 funcname = token.value 518 519 last_token = token -
babel/messages/tests/extract.py
303 303 self.assertEqual(u'Bonjour à tous', messages[0][2]) 304 304 self.assertEqual(messages[0][2], messages[1][2]) 305 305 306 class ExtractJavaScriptTestCase(unittest.TestCase): 307 308 def test_simple_extract(self): 309 buf = StringIO("""\ 310 msg1 = _('simple') 311 msg2 = gettext('simple') 312 msg3 = ngettext('s', 'p', 42) 313 """) 314 messages = \ 315 list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS, 316 [], {})) 317 318 self.assertEqual([(1, 'simple', []), 319 (2, 'simple', []), 320 (3, ('s', 'p'), [])], messages) 321 322 def test_various_calls(self): 323 buf = StringIO("""\ 324 msg1 = _(i18n_arg.replace(/"/, '"')) 325 msg2 = ungettext(i18n_arg.replace(/"/, '"'), multi_arg.replace(/"/, '"'), 2) 326 msg3 = ungettext("Babel", multi_arg.replace(/"/, '"'), 2) 327 msg4 = ungettext(i18n_arg.replace(/"/, '"'), "Babels", 2) 328 msg5 = ungettext('bunny', 'bunnies', parseInt(Math.random() * 2 + 1)) 329 msg6 = ungettext(arg0, 'bunnies', rparseInt(Math.random() * 2 + 1)) 330 msg7 = _(hello.there) 331 msg8 = gettext('Rabbit') 332 msg9 = dgettext('wiki', model.addPage()) 333 msg10 = dngettext(domain, 'Page', 'Pages', 3) 334 """) 335 messages = \ 336 list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS, [], 337 {})) 338 self.assertEqual([(5, (u'bunny', u'bunnies'), []), 339 (8, u'Rabbit', []), 340 (10, (u'Page', u'Pages'), [])], messages) 341 342 def test_message_with_line_comment(self): 343 buf = StringIO("""\ 344 // NOTE: hello 345 msg = _('Bonjour à tous') 346 """) 347 messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {})) 348 self.assertEqual(u'Bonjour à tous', messages[0][2]) 349 self.assertEqual([u'hello'], messages[0][3]) 350 351 def test_message_with_multiline_comment(self): 352 buf = StringIO("""\ 353 /* NOTE: hello 354 and bonjour 355 and servus */ 356 msg = _('Bonjour à tous') 357 """) 358 messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {})) 359 self.assertEqual(u'Bonjour à tous', messages[0][2]) 360 self.assertEqual([u'hello', 'and bonjour', ' and servus'], messages[0][3]) 361 362 def test_ignore_function_definitions(self): 363 buf = StringIO("""\ 364 function gettext(value) { 365 return translations[language][value] || value; 366 }""") 367 368 messages = list(extract.extract_javascript(buf, ('gettext',), [], {})) 369 self.assertEqual(messages, []) 370 371 def test_misplaced_comments(self): 372 buf = StringIO("""\ 373 /* NOTE: this won't show up */ 374 foo() 375 376 /* NOTE: this will */ 377 msg = _('Something') 378 379 // NOTE: this will show up 380 // too. 381 msg = _('Something else') 382 383 // NOTE: but this won't 384 bar() 385 386 _('no comment here') 387 """) 388 messages = list(extract.extract_javascript(buf, ('_',), ['NOTE:'], {})) 389 self.assertEqual(u'Something', messages[0][2]) 390 self.assertEqual([u'this will'], messages[0][3]) 391 self.assertEqual(u'Something else', messages[1][2]) 392 self.assertEqual([u'this will show up', 'too.'], messages[1][3]) 393 self.assertEqual(u'no comment here', messages[2][2]) 394 self.assertEqual([], messages[2][3]) 395 306 396 class ExtractTestCase(unittest.TestCase): 307 397 308 398 def test_invalid_filter(self): … … 364 454 suite = unittest.TestSuite() 365 455 suite.addTest(doctest.DocTestSuite(extract)) 366 456 suite.addTest(unittest.makeSuite(ExtractPythonTestCase)) 457 suite.addTest(unittest.makeSuite(ExtractJavaScriptTestCase)) 367 458 suite.addTest(unittest.makeSuite(ExtractTestCase)) 368 459 return suite 369 460 -
babel/messages/jslexer.py
1 # -*- coding: utf-8 -*- 2 # 3 # Copyright (C) 2008 Edgewall Software 4 # All rights reserved. 5 # 6 # This software is licensed as described in the file COPYING, which 7 # you should have received as part of this distribution. The terms 8 # are also available at http://babel.edgewall.org/wiki/License. 9 # 10 # This software consists of voluntary contributions made by many 11 # individuals. For the exact contribution history, see the revision 12 # history and logs, available at http://babel.edgewall.org/log/. 13 14 """A simple JavaScript 1.5 lexer which is used for the JavaScript 15 extractor. 16 """ 17 18 import re 19 from operator import itemgetter 20 21 22 operators = [ 23 '+', '-', '*', '%', '!=', '==', '<', '>', '<=', '>=', '=', 24 '+=', '-=', '*=', '%=', '<<', '>>', '>>>', '<<=', '>>=', 25 '>>>=', '&', '&=', '|', '|=', '&&', '||', '^', '^=', '(', ')', 26 '[', ']', '{', '}', '!', '--', '++', '~', ',', ';', '.' 27 ] 28 operators.sort(lambda a, b: cmp(-len(a), -len(b))) 29 30 escapes = {'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t'} 31 32 rules = [ 33 (None, re.compile(r'\s+(?u)')), 34 (None, re.compile(r'<!--.*')), 35 ('linecomment', re.compile(r'//.*')), 36 ('multilinecomment', re.compile(r'/\*.*?\*/(?us)')), 37 ('name', re.compile(r'(\$+\w*|[^\W\d]\w*)(?u)')), 38 ('number', re.compile(r'''(?x)( 39 (?:0|[1-9]\d*) 40 (\.\d+)? 41 ([eE][-+]?\d+)? | 42 (0x[a-fA-F0-9]+) 43 )''')), 44 ('operator', re.compile(r'(%s)' % '|'.join(map(re.escape, operators)))), 45 ('string', re.compile(r'''(?xs)( 46 '(?:[^'\\]*(?:\\.[^'\\]*)*)' | 47 "(?:[^"\\]*(?:\\.[^"\\]*)*)" 48 )''')) 49 ] 50 51 division_re = re.compile(r'/=?') 52 regex_re = re.compile(r'/.+?/[a-zA-Z]*(?s)') 53 line_re = re.compile(r'(\r\n|\n|\r)') 54 line_join_re = re.compile(r'\\' + line_re.pattern) 55 uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}') 56 57 58 class TokenError(ValueError): 59 """Raised if the tokenizer stumbled upon invalid tokens.""" 60 61 class Token(tuple): 62 """Represents a token as returned by `tokenize`.""" 63 __slots__ = () 64 65 def __new__(cls, type, value, lineno): 66 return tuple.__new__(cls, (type, value, lineno)) 67 68 type = property(itemgetter(0)) 69 value = property(itemgetter(1)) 70 lineno = property(itemgetter(2)) 71 72 def indicates_division(token): 73 """A helper function that helps the tokenizer to decide if the current 74 token may be followed by a division operator. 75 """ 76 if token.type == 'operator': 77 return token.value in (')', ']', '}', '++', '--') 78 return token.type in ('name', 'number', 'string', 'regexp') 79 80 def unquote_string(string): 81 """Unquote a string with JavaScript rules. The string has to start with 82 string delimiters (``'`` or ``"``.) 83 84 :return: a string 85 """ 86 assert string and string[0] == string[-1] and string[0] in '"\'', \ 87 'string provided is not properly delimited' 88 string = line_join_re.sub('\\1', string[1:-1]) 89 result = [] 90 add = result.append 91 pos = 0 92 93 while 1: 94 # scan for the next escape 95 escape_pos = string.find('\\', pos) 96 if escape_pos < 0: 97 break 98 add(string[pos:escape_pos]) 99 100 # check which character is escaped 101 next_char = string[escape_pos + 1] 102 if next_char in escapes: 103 add(escapes[next_char]) 104 105 # unicode escapes. trie to consume up to four characters of 106 # hexadecimal characters and try to interpret them as unicode 107 # character point. If there is no such character point, put 108 # all the consumed characters into the string. 109 elif next_char in 'uU': 110 escaped = uni_escape_re.match(string, escape_pos + 2) 111 if escaped is not None: 112 escaped_value = escaped.group() 113 if len(escaped_value) == 4: 114 try: 115 add(unichr(int(escaped_value, 16))) 116 except ValueError: 117 pass 118 else: 119 pos = escape_pos + 6 120 continue 121 add(next_char + escaped_value) 122 pos = escaped.end() 123 continue 124 else: 125 add(next_char) 126 127 # bogus escape. Just remove the backslash. 128 else: 129 add(next_char) 130 pos = escape_pos + 2 131 132 if pos < len(string): 133 add(string[pos:]) 134 135 return u''.join(result) 136 137 def tokenize(source): 138 """Tokenize a JavaScript source. 139 140 :return: generator of `Token`\s 141 """ 142 may_divide = False 143 pos = 0 144 lineno = 1 145 end = len(source) 146 147 while pos < end: 148 # handle regular rules first 149 for token_type, rule in rules: 150 match = rule.match(source, pos) 151 if match is not None: 152 break 153 # if we don't have a match we don't give up yet, but check for 154 # division operators or regular expression literals, based on 155 # the status of `may_divide` which is determined by the last 156 # processed non-whitespace token using `indicates_division`. 157 else: 158 if may_divide: 159 match = division_re.match(source, pos) 160 token_type = 'operator' 161 else: 162 match = regex_re.match(source, pos) 163 token_type = 'regexp' 164 if match is None: 165 raise TokenError('invalid syntax around line %d' % lineno) 166 167 token_value = match.group() 168 if token_type is not None: 169 token = Token(token_type, token_value, lineno) 170 may_divide = indicates_division(token) 171 yield token 172 lineno += len(line_re.findall(token_value)) 173 pos = match.end() -
setup.py
75 75 [babel.extractors] 76 76 ignore = babel.messages.extract:extract_nothing 77 77 python = babel.messages.extract:extract_python 78 javascript = babel.messages.extract:extract_javascript 78 79 """, 79 80 80 81 cmdclass = {'build_doc': build_doc, 'test_doc': test_doc}
