feat: JS extractor and custom JS parser

This commit is contained in:
Ankush Menat 2022-08-02 18:23:45 +05:30
parent 1425842ef0
commit cd53466b6a
2 changed files with 203 additions and 3 deletions

View file

@ -10,6 +10,8 @@ import frappe
import frappe.translate
from frappe import _
from frappe.translate import (
extract_javascript,
extract_messages_from_javascript_code,
extract_messages_from_python_code,
get_language,
get_parent_language,
@ -131,7 +133,7 @@ class TestTranslate(unittest.TestCase):
"""Load all CSV files to ensure they have correct format"""
verify_translation_files("frappe")
def test_python_ast_extractor(self):
def test_python_extractor(self):
code = textwrap.dedent(
"""
@ -160,7 +162,61 @@ class TestTranslate(unittest.TestCase):
]
output = extract_messages_from_python_code(code)
self.assertEqual(output, expected_output, msg=output)
self.assertEqual(len(expected_output), len(output))
for expected, actual in zip(expected_output, output):
with self.subTest():
self.assertEqual(expected, actual)
def test_js_extractor(self):
    """Check (lineno, message, context) triples extracted from ``__()`` calls.

    The fixture covers a plain call, a ``null`` placeholder argument,
    array replacements with and without a trailing context, calls spread
    over several lines, and a trailing comma. The final ``_(...)`` call
    uses a single underscore, which is not the ``__`` keyword, so no entry
    is expected for it.
    """
    js_source = textwrap.dedent(
        """
__("attr")
__("attr with", null, "context")
__("attr with", ["format", "replacements"], "context")
__("attr with", ["format", "replacements"])
__(
"Long JS string with", [
"format", "replacements"
],
"JS context on newline"
)
__(
"Long JS string with formats only {0}", [
"format", "replacements"
],
)
_(`template strings not supported yet`)
"""
    )

    # Line numbers are 1-based; the fixture starts with an empty line, so
    # the first call sits on line 2.
    expected_messages = [
        (2, "attr", None),
        (3, "attr with", "context"),
        (4, "attr with", "context"),
        (5, "attr with", None),
        (6, "Long JS string with", "JS context on newline"),
        (12, "Long JS string with formats only {0}", None),
    ]

    extracted = extract_messages_from_javascript_code(js_source)

    # Compare the counts first: zip() below would silently stop at the
    # shorter sequence.
    self.assertEqual(len(expected_messages), len(extracted))
    for wanted, found in zip(expected_messages, extracted):
        with self.subTest():
            self.assertEqual(wanted, found)
def test_js_parser_arg_capturing(self):
    """Get non-flattened args in correct order so 3rd arg if present is always context."""

    def first_call_args(js_source):
        # Each item yielded by the parser ends with the captured arguments.
        first_call = next(extract_javascript(js_source))
        return first_call[-1]

    # The array argument must be reported as a single ``None`` placeholder.
    with_context = first_call_args("""__("attr with", ["format", "replacements"], "context")""")
    self.assertEqual(with_context, ("attr with", None, "context"))

    without_context = first_call_args("""__("attr with", ["format", "replacements"])""")
    self.assertEqual(without_context, ("attr with", None))
def verify_translation_files(app):

View file

@ -17,6 +17,7 @@ import re
from csv import reader
from babel.messages.extract import extract_python
from babel.messages.jslexer import Token, tokenize, unquote_string
from pypika.terms import PseudoColumn
import frappe
@ -707,6 +708,8 @@ def get_messages_from_file(path: str) -> list[tuple[str, str, str | None, int]]:
if path.lower().endswith(".py"):
messages = extract_messages_from_python_code(file_contents)
elif path.lower().endswith(".js"):
messages = extract_messages_from_javascript_code(file_contents)
else:
messages = extract_messages_from_code(file_contents)
return [
@ -718,7 +721,7 @@ def get_messages_from_file(path: str) -> list[tuple[str, str, str | None, int]]:
def extract_messages_from_python_code(code: str) -> list[tuple[int, str, str | None]]:
"""Extracts translatable strings from python code using AST"""
"""Extracts translatable strings from Python code using babel."""
messages = []
@ -741,6 +744,147 @@ def extract_messages_from_python_code(code: str) -> list[tuple[int, str, str | N
return messages
def extract_messages_from_javascript_code(code: str) -> list[tuple[int, str, str | None]]:
    """Extracts translatable strings from JavaScript code.

    Runs ``extract_javascript`` with ``__`` as the only keyword and turns
    every call it reports into a ``(lineno, source_text, context)`` tuple.

    :param code: JavaScript source as a string
    :return: one tuple per call whose first argument is a non-empty string;
            ``context`` is the third argument when the call has exactly
            three arguments and the third one is a string, else ``None``
    """
    extracted = []

    for lineno, _funcname, args in extract_javascript(code, keywords=["__"], options={}):
        # ``args`` is either a bare string (single argument) or a tuple.
        # Skip calls without a usable first argument.
        if not args or not args[0]:
            continue

        if isinstance(args, tuple):
            source_text = args[0]
            has_context = len(args) == 3 and isinstance(args[2], str)
            context = args[2] if has_context else None
        else:
            source_text = args
            context = None

        extracted.append((lineno, source_text, context))

    return extracted
def extract_javascript(code, keywords=("__",), options=None):
    """Extract messages from JavaScript source code.

    This is a modified version of babel's JS parser. Reused under BSD license.
    License: https://github.com/python-babel/babel/blob/master/LICENSE

    Changes from upstream:
    - Preserve arguments, babel's parser flattened all values in args,
      we need order because we use different syntax for translation
      which can contain 2nd arg which is array of many values. If an
      argument is not a string literal then its value is NOT returned
      in args, ``None`` takes its position instead.
      E.g. __("0", ["1", "2"], "3") -> ("0", None, "3")
           __("0", ["1", "2"])      -> ("0", None)
    - remove comments support
    - changed signature to accept string directly.

    :param code: code as string
    :param keywords: a list of keywords (i.e. function names) that should be
                    recognized as translation functions. A bare string is
                    treated as a single keyword.
    :param options: a dictionary of additional options (optional)
                    Supported options are:
                    * `template_string` -- set to false to disable ES6
                    template string support.
    :return: generator of ``(lineno, funcname, messages)`` where ``lineno``
            is the line of the call's opening parenthesis and ``messages``
            is the bare string for a single-argument call or a tuple of
            positional arguments otherwise. Calls with no arguments, or
            whose only argument is not a string literal, are skipped.
    """
    if options is None:
        options = {}

    # A bare string would turn ``token.value in keywords`` into a substring
    # test (e.g. "_" in "__"), so always work with a real collection.
    if isinstance(keywords, str):
        keywords = (keywords,)

    funcname = message_lineno = None
    messages = []
    last_argument = None
    concatenate_next = False
    last_token = None
    call_stack = -1  # -1: outside a keyword call, 0: directly inside it, >0: nested parens

    # Depth inside array/object literals of the current call. Only strings at
    # depth 0 are arguments of the translation function itself:
    #   __("0", ["1", "2"], "3")  ->  "1" and "2" live at depth 1 and are ignored
    nesting_depth = 0
    # True once the current argument slot has received any token; lets a
    # trailing non-string argument still be reported as ``None``.
    argument_started = False
    opening_brackets = ("[", "{")
    closing_brackets = ("]", "}")

    dotted = any("." in kw for kw in keywords)

    for token in tokenize(
        code,
        jsx=True,
        template_string=options.get("template_string", True),
        dotted=dotted,
    ):
        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
            funcname
            and call_stack == -1  # the keyword call has not been opened yet...
            and (last_token and last_token.type == "name")  # have a keyword...
            and token.type == "template_string"  # directly followed by a template string
        ):
            message_lineno = token.lineno
            messages = [unquote_string(token.value)]
            call_stack = 0
            # Pretend the call was closed so the regular ")" handling emits it.
            token = Token("operator", ")", token.lineno)

        is_operator = token.type == "operator"

        if is_operator and token.value == "(":
            if funcname:
                if call_stack == -1:
                    # Only the keyword's own parenthesis defines the message line.
                    message_lineno = token.lineno
                else:
                    # A parenthesised expression / nested call is part of an argument.
                    argument_started = True
                call_stack += 1

        elif token.type in ("linecomment", "multilinecomment"):
            pass  # comments never contribute to messages

        elif funcname and call_stack == 0:
            if is_operator and token.value == ")":
                if last_argument is not None:
                    messages.append(last_argument)
                elif argument_started:
                    # Last argument exists but is not a string: keep its position.
                    messages.append(None)

                if len(messages) > 1:
                    result = tuple(messages)
                elif messages:
                    result = messages[0]
                else:
                    result = None

                if result is not None:
                    yield (message_lineno, funcname, result)

                # Reset all per-call state for the next keyword.
                funcname = message_lineno = last_argument = None
                concatenate_next = False
                argument_started = False
                messages = []
                call_stack = -1
                nesting_depth = 0

            elif is_operator and token.value in opening_brackets:
                nesting_depth += 1
                argument_started = True

            elif is_operator and token.value in closing_brackets:
                # Clamp so malformed input cannot push the depth below zero.
                nesting_depth = max(nesting_depth - 1, 0)

            elif nesting_depth > 0:
                pass  # contents of array/object literals are not arguments

            elif is_operator and token.value == ",":
                # Close the current slot; ``None`` marks a non-string argument.
                messages.append(last_argument)
                last_argument = None
                argument_started = False
                concatenate_next = False

            else:
                argument_started = True
                if token.type in ("string", "template_string"):
                    new_value = unquote_string(token.value)
                    if concatenate_next:
                        last_argument = (last_argument or "") + new_value
                        concatenate_next = False
                    else:
                        last_argument = new_value
                elif is_operator and token.value == "+":
                    concatenate_next = True

        elif call_stack > 0 and is_operator and token.value == ")":
            call_stack -= 1

        elif funcname and call_stack == -1:
            # Keyword was not followed by a call; forget it.
            funcname = None

        elif (
            call_stack == -1
            and token.type == "name"
            and token.value in keywords
            and (last_token is None or last_token.type != "name" or last_token.value != "function")
        ):
            funcname = token.value

        last_token = token
def extract_messages_from_code(code):
"""
Extracts translatable strings from a code file