534 lines
16 KiB
Python
534 lines
16 KiB
Python
"""Generate a review-friendly summary for large translation PRs.

This helper runs in GitHub Actions for bot-authored `.po` pull requests.
It compares the trusted base checkout against the PR head translation files,
groups similarly sized file diffs, and renders a markdown comment with the
high-signal translation changes that are hard to inspect in GitHub's UI.
"""
|
|
|
|
import argparse
|
|
import html
|
|
import io
|
|
import json
|
|
import os
|
|
import time
|
|
import urllib.parse
|
|
import urllib.request
|
|
from collections import Counter
|
|
from dataclasses import dataclass
|
|
from pathlib import Path
|
|
from typing import Any
|
|
from urllib.error import HTTPError
|
|
|
|
from babel.messages.pofile import read_po
|
|
|
|
# Hidden HTML comment emitted as the first line of the generated comment body.
# NOTE(review): presumably the workflow uses it to find its earlier comment — confirm.
COMMENT_MARKER = "<!-- po-translation-review -->"
|
|
# Default relative tolerance (fraction of the reference value) used by
# `within_tolerance` when grouping files by added/removed line counts.
SIMILARITY_TOLERANCE = 0.02
|
|
# Base names of `.po` files that are left out of the reviewer-facing sections.
# NOTE(review): the reason for hiding `eo.po` is not stated in this file — confirm.
REVIEW_HIDDEN_PO_FILES = {"eo.po"}
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class TranslationEntry:
|
|
"""Normalized representation of a gettext entry used for diffing."""
|
|
|
|
context: str
|
|
msgid: str
|
|
msgid_plural: str | None
|
|
translation: tuple[str, ...]
|
|
|
|
@property
|
|
def key(self) -> tuple[str, str, str]:
|
|
return (self.context, self.msgid, self.msgid_plural or "")
|
|
|
|
|
|
def parse_args() -> argparse.Namespace:
    """Parse command-line options, falling back to environment variables.

    `--repo`, `--pr` and `--head-sha` default to the `GITHUB_REPOSITORY`,
    `PR_NUMBER` and `PR_HEAD_SHA` environment variables respectively.

    Returns:
        Namespace with `repo`, `pr` (int or None), `head_sha` and `output`.
    """
    parser = argparse.ArgumentParser(
        description="Build a PR review comment for .po file changes in a GitHub pull request."
    )
    parser.add_argument("--repo", default=os.environ.get("GITHUB_REPOSITORY"))
    # argparse runs string defaults through `type`, so an exported-but-empty
    # PR_NUMBER would abort with "invalid int value: ''". Treat it as unset so
    # the caller can report the missing context itself.
    parser.add_argument("--pr", type=int, default=os.environ.get("PR_NUMBER") or None)
    parser.add_argument("--head-sha", default=os.environ.get("PR_HEAD_SHA"))
    parser.add_argument("--output", default="po-pr-review-comment.md")
    return parser.parse_args()
|
|
|
|
|
|
def request_url(
    url: str,
    *,
    accept: str,
    allow_missing: bool = False,
    timeout: float = 30.0,
) -> bytes | None:
    """Fetch bytes from GitHub with auth, retries, and optional 404 handling.

    Args:
        url: Absolute URL to request.
        accept: Value sent in the `Accept` header.
        allow_missing: When true, an HTTP 404 yields `None` instead of raising.
        timeout: Seconds to wait on blocking socket operations per attempt, so
            a stalled connection cannot hang the job indefinitely.

    Returns:
        The response body, or `None` for an allowed 404.

    Raises:
        HTTPError: For non-retryable statuses, or once retries are exhausted.
        Other network errors (including timeouts) propagate without retry.
    """
    headers = {
        "Accept": accept,
        "X-GitHub-Api-Version": "2022-11-28",
        "User-Agent": "frappe-po-review-helper",
    }
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        headers["Authorization"] = f"Bearer {token}"

    retryable_statuses = {403, 429, 500, 502, 503, 504}
    max_retries = 5
    retries = 0

    while True:
        request = urllib.request.Request(url, headers=headers)
        try:
            with urllib.request.urlopen(request, timeout=timeout) as response:
                return response.read()
        except HTTPError as exc:
            if exc.code == 404 and allow_missing:
                return None

            if exc.code in retryable_statuses and retries < max_retries:
                retries += 1
                # Linear back-off: sleep 1s, 2s, ... up to 5s between attempts.
                time.sleep(retries)
                continue

            raise
|
|
|
|
|
|
def request_json(url: str) -> Any:
    """Fetch `url` with the GitHub JSON media type and return the decoded payload.

    Returns `None` when the underlying request yields no body.
    """
    raw = request_url(url, accept="application/vnd.github+json")
    return None if raw is None else json.loads(raw.decode("utf-8"))
|
|
|
|
|
|
def fetch_pr_files(repo: str, pr_number: int) -> list[dict[str, Any]]:
    """Collect every changed-file record of a pull request across all result pages."""
    page_size = 100
    endpoint = f"https://api.github.com/repos/{repo}/pulls/{pr_number}/files"

    collected: list[dict[str, Any]] = []
    page = 1
    more_pages = True

    while more_pages:
        batch = request_json(f"{endpoint}?per_page={page_size}&page={page}") or []
        collected.extend(batch)
        # A short (or empty) page means there is nothing further to fetch.
        more_pages = len(batch) >= page_size
        page += 1

    return collected
|
|
|
|
|
|
def read_local_file(path: str | None) -> str | None:
|
|
"""Read a file from the trusted base checkout while preventing path traversal."""
|
|
|
|
if not path:
|
|
return None
|
|
|
|
repo_root = Path.cwd().resolve()
|
|
file_path = (repo_root / path).resolve()
|
|
try:
|
|
file_path.relative_to(repo_root)
|
|
except ValueError as exc:
|
|
raise ValueError(f"Unexpected repository path: {path}") from exc
|
|
|
|
if not file_path.exists():
|
|
return None
|
|
|
|
return file_path.read_text(encoding="utf-8")
|
|
|
|
|
|
def fetch_file_content(repo: str, path: str | None, ref: str | None) -> str | None:
|
|
"""Fetch the raw content for a repository file at a specific git ref."""
|
|
|
|
if not path or not ref:
|
|
return None
|
|
|
|
quoted_path = urllib.parse.quote(path, safe="/")
|
|
quoted_ref = urllib.parse.quote(ref, safe="")
|
|
url = f"https://api.github.com/repos/{repo}/contents/{quoted_path}?ref={quoted_ref}"
|
|
response = request_url(url, accept="application/vnd.github.raw", allow_missing=True)
|
|
if response is None:
|
|
return None
|
|
return response.decode("utf-8")
|
|
|
|
|
|
def is_po_file(change: dict[str, Any]) -> bool:
    """Return whether the change's current or previous filename ends in `.po`."""
    candidates = (change.get("filename", ""), change.get("previous_filename", ""))
    return any(candidate.endswith(".po") for candidate in candidates)
|
|
|
|
|
|
def base_path_for_file(change: dict[str, Any]) -> str | None:
|
|
if change.get("status") == "renamed":
|
|
return change.get("previous_filename") or change.get("filename")
|
|
return change.get("filename")
|
|
|
|
|
|
def head_path_for_file(change: dict[str, Any]) -> str | None:
|
|
if change.get("status") == "removed":
|
|
return None
|
|
return change.get("filename")
|
|
|
|
|
|
def normalize_translation(value: Any) -> tuple[str, ...]:
    """Coerce a message string (scalar, sequence, or `None`) into a tuple of strings.

    `None`, whether the whole value or an item of a sequence, becomes `""`.
    """
    if isinstance(value, (tuple, list)):
        return tuple(str(part) if part is not None else "" for part in value)

    text = "" if value is None else str(value)
    return (text,)
|
|
|
|
|
|
def is_translation_empty(translation: tuple[str, ...]) -> bool:
    """Return True when no item of `translation` has non-whitespace content."""
    return all(not part.strip() for part in translation)
|
|
|
|
|
|
def normalize_message(message: Any) -> TranslationEntry:
    """Convert a parsed catalog message into a `TranslationEntry`.

    Args:
        message: Object exposing `id`, `context` and `string` attributes.

    Returns:
        The entry with a missing context mapped to `""` and the translation
        normalized by `normalize_translation`.
    """
    # A plural message carries its ids as a (singular, plural) pair. Accept a
    # list as well as a tuple, matching what `normalize_translation` accepts,
    # so a list-valued id is not stringified as a whole.
    if isinstance(message.id, (tuple, list)):
        msgid, msgid_plural = message.id
    else:
        msgid, msgid_plural = message.id, None

    return TranslationEntry(
        context=message.context or "",
        msgid=str(msgid),
        msgid_plural=None if msgid_plural is None else str(msgid_plural),
        translation=normalize_translation(message.string),
    )
|
|
|
|
|
|
def load_translation_entries(
    content: str | None,
) -> tuple[str | None, dict[tuple[str, str, str], TranslationEntry]]:
    """Parse `.po` text into a locale name and a key-indexed entry mapping.

    Messages with an empty id (the gettext header) are left out. Singular and
    plural messages are both converted to `TranslationEntry` objects, indexed
    by `TranslationEntry.key`, so they can be compared uniformly.

    Returns:
        `(language, entries)`; `(None, {})` when `content` is empty or `None`.
    """
    if not content:
        return None, {}

    catalog = read_po(io.StringIO(content))
    locale = catalog.locale
    language = str(locale) if locale else None

    normalized = (normalize_message(message) for message in catalog if message.id)
    # On a repeated key the later message wins.
    entries: dict[tuple[str, str, str], TranslationEntry] = {entry.key: entry for entry in normalized}

    return language, entries
|
|
|
|
|
|
def compare_entries(
    base_entries: dict[tuple[str, str, str], TranslationEntry],
    head_entries: dict[tuple[str, str, str], TranslationEntry],
) -> list[dict[str, TranslationEntry | str | None]]:
    """List the translations that are new or different on the head side.

    Entries that exist only in the base are ignored: reviewers mainly need to
    see what the PR introduces or modifies. New entries whose translation is
    empty are skipped as well, to keep untranslated strings out of the tables.

    Returns:
        Dicts with `status` ("added" or "changed"), `before` and `after`,
        ordered case-insensitively by entry key.
    """

    def case_insensitive(entry_key: tuple[str, str, str]) -> tuple[str, ...]:
        return tuple(part.lower() for part in entry_key)

    changes: list[dict[str, TranslationEntry | str | None]] = []

    for key in sorted(head_entries, key=case_insensitive):
        current = head_entries[key]
        previous = base_entries.get(key)

        if previous is not None:
            if previous.translation != current.translation:
                changes.append({"status": "changed", "before": previous, "after": current})
        elif not is_translation_empty(current.translation):
            changes.append({"status": "added", "before": None, "after": current})

    return changes
|
|
|
|
|
|
def within_tolerance(value: int, reference: float, tolerance: float = SIMILARITY_TOLERANCE) -> bool:
    """Return whether `value` lies within the allowed distance of `reference`.

    The allowed distance is `reference * tolerance` rounded to an integer, but
    never less than 1. A zero reference only matches a zero value.
    """
    if reference != 0:
        slack = round(reference * tolerance)
        if slack < 1:
            slack = 1
        return -slack <= value - reference <= slack

    return value == 0
|
|
|
|
|
|
def cluster_similar_change_sizes(changes: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Group files whose added and removed line counts are within the tolerance.

    This helps spot bulk-generated translation updates where many locale files
    were changed in nearly the same way.

    Args:
        changes: Changed-file records; missing `additions` / `deletions` /
            `filename` keys are treated as `0` / `0` / `""`.

    Returns:
        Clusters holding more than one file, each a dict with `files`,
        `avg_additions` and `avg_deletions`, ordered by descending file count,
        then descending averages.
    """

    def additions_of(item: dict[str, Any]) -> int:
        return item.get("additions", 0)

    def deletions_of(item: dict[str, Any]) -> int:
        return item.get("deletions", 0)

    clusters: list[dict[str, Any]] = []

    # Largest changes first, so each cluster is seeded by its biggest member.
    ordered = sorted(
        changes,
        key=lambda item: (-additions_of(item), -deletions_of(item), item.get("filename", "")),
    )

    for change in ordered:
        additions = additions_of(change)
        deletions = deletions_of(change)

        for cluster in clusters:
            if within_tolerance(additions, cluster["avg_additions"]) and within_tolerance(
                deletions, cluster["avg_deletions"]
            ):
                members = cluster["files"]
                members.append(change)
                # Use the same defaulting accessors as the sort key and the
                # seeding below, so a record without counts cannot raise
                # KeyError while averaging.
                cluster["avg_additions"] = sum(map(additions_of, members)) / len(members)
                cluster["avg_deletions"] = sum(map(deletions_of, members)) / len(members)
                break
        else:
            # No existing cluster is close enough: start a new one.
            clusters.append(
                {
                    "files": [change],
                    "avg_additions": float(additions),
                    "avg_deletions": float(deletions),
                }
            )

    return sorted(
        [cluster for cluster in clusters if len(cluster["files"]) > 1],
        key=lambda cluster: (-len(cluster["files"]), -cluster["avg_additions"], -cluster["avg_deletions"]),
    )
|
|
|
|
|
|
def format_translation(translation: tuple[str, ...]) -> str:
    """Render a translation tuple as text for display.

    A single-item tuple is returned as-is. Otherwise each item goes on its own
    line prefixed with its index, and empty items are shown as `(empty)`.
    """
    if len(translation) != 1:
        numbered = []
        for position, text in enumerate(translation):
            shown = text if text else "(empty)"
            numbered.append(f"[{position}] {shown}")
        return "\n".join(numbered)

    (only,) = translation
    return only
|
|
|
|
|
|
def escape_table_cell(value: str) -> str:
    """Make `value` safe to embed in a single markdown table cell.

    Args:
        value: Raw cell text; may contain HTML, pipes and newlines.

    Returns:
        `<em>empty</em>` for an empty value; otherwise the HTML-escaped text
        with pipes encoded and newlines turned into `<br>`.
    """
    if not value:
        return "<em>empty</em>"

    escaped = html.escape(value)
    # A literal pipe would end the cell and shift every following column, so
    # encode it as a character reference. This runs after `html.escape` so the
    # `&` of the reference is not escaped again.
    escaped = escaped.replace("|", "&#124;")
    return escaped.replace("\n", "<br>")
|
|
|
|
|
|
def render_msgid(entry: TranslationEntry) -> str:
    """Return the entry's msgid, followed by a `[plural]` line when it has a plural form."""
    if not entry.msgid_plural:
        return entry.msgid
    return f"{entry.msgid}\n[plural] {entry.msgid_plural}"
|
|
|
|
|
|
def should_hide_report_from_review(report: dict[str, Any]) -> bool:
    """Return whether the report's file name is listed in `REVIEW_HIDDEN_PO_FILES`."""
    file_name = Path(str(report["path"])).name
    return file_name in REVIEW_HIDDEN_PO_FILES
|
|
|
|
|
|
def build_language_section(report: dict[str, Any]) -> list[str]:
    """Build the markdown heading and change table for one language report.

    Each change whose `after` value is a `TranslationEntry` becomes one table
    row; other changes are skipped. The returned lines end with a blank line.
    """
    header = [
        f"### `{report['language']}` (`{report['path']}`)",
        "",
        "| Status | Msgid | Previous | Current |",
        "| --- | --- | --- | --- |",
    ]

    rows: list[str] = []
    for change in report["changes"]:
        previous = change["before"]
        current = change["after"]

        if not isinstance(current, TranslationEntry):
            continue

        if isinstance(previous, TranslationEntry):
            previous_text = format_translation(previous.translation)
        else:
            previous_text = ""

        cells = (
            str(change["status"]),
            escape_table_cell(render_msgid(current)),
            escape_table_cell(previous_text),
            escape_table_cell(format_translation(current.translation)),
        )
        row = " | ".join(cells)
        rows.append(f"| {row} |")

    return [*header, *rows, ""]
|
|
|
|
|
|
def build_comment(
    po_files: list[dict[str, Any]],
    language_reports: list[dict[str, Any]],
    similar_groups: list[dict[str, Any]],
    parse_errors: list[dict[str, str]],
) -> str:
    """Build the final PR comment with stats, grouped diffs, and translation tables.

    The result is intentionally compact at the top and expandable below so large
    translation PRs stay reviewable even when GitHub cannot render the raw diff.

    Args:
        po_files: Changed-file records for the `.po` files in the PR.
        language_reports: Per-file reports with `language`, `path`, `status`
            and `changes`; hidden files are filtered out here.
        similar_groups: Clusters with `files`, `avg_additions`, `avg_deletions`.
        parse_errors: Dicts with `path` and `error` for files that failed.

    Returns:
        The markdown comment body, starting with `COMMENT_MARKER`.
    """

    def report_bullets(title: str, reports: list[dict[str, Any]]) -> list[str]:
        """Render a titled bullet list of reports, or nothing when the list is empty."""
        if not reports:
            return []
        bullets = [f"- `{report['language']}` (`{report['path']}`)" for report in reports]
        return [title, "", *bullets, ""]

    status_counts = Counter(change.get("status", "modified") for change in po_files)

    reviewable_reports = [
        report for report in language_reports if not should_hide_report_from_review(report)
    ]
    reports_with_changes = [report for report in reviewable_reports if report["changes"]]
    removed_reports = [report for report in reviewable_reports if report["status"] == "removed"]
    metadata_only_reports = [
        report
        for report in reviewable_reports
        if not report["changes"] and report["status"] != "removed"
    ]

    grouped_files_count = sum(len(group["files"]) for group in similar_groups)
    translation_change_count = sum(len(report["changes"]) for report in reports_with_changes)
    changed_languages_count = len(reports_with_changes)

    # Derive the displayed percentage from the constant the clustering uses, so
    # the text cannot drift from the actual tolerance ("2%" for 0.02).
    tolerance_label = f"{SIMILARITY_TOLERANCE * 100:g}%"

    lines = [
        COMMENT_MARKER,
        "Here is a summary of the `.po` file changes:",
        "",
        f"- Changed files: `{len(po_files)}`",
        f"- Added files: `{status_counts['added']}`",
        f"- Removed files: `{status_counts['removed']}`",
        f"- Files in similar change-size groups within {tolerance_label} tolerance: `{grouped_files_count}`",
        f"- Added or changed translations detected: `{translation_change_count}` across `{changed_languages_count}` file(s)",
    ]

    if parse_errors:
        lines.append(f"- Files that could not be parsed: `{len(parse_errors)}`")

    lines.extend(["", "### Similar Change-Size Groups", ""])

    if similar_groups:
        for group in similar_groups:
            representative_additions = round(group["avg_additions"])
            representative_deletions = round(group["avg_deletions"])
            file_names = ", ".join(f"`{Path(file['filename']).name}`" for file in group["files"])
            lines.append(
                f"- Around `+{representative_additions} / -{representative_deletions}` lines: "
                f"`{len(group['files'])}` files ({file_names})"
            )
    else:
        lines.append(f"- No repeated change-size groups were found within the {tolerance_label} tolerance.")

    # Everything below is collapsed by default to keep the comment short.
    lines.extend(
        [
            "",
            "<details>",
            f"<summary>Added or changed translations by language ({translation_change_count} entries across {changed_languages_count} file(s))</summary>",
            "",
        ]
    )

    if translation_change_count:
        for report in reports_with_changes:
            lines.extend(build_language_section(report))
    else:
        lines.extend(
            [
                "No added or changed translations were detected. The `.po` changes appear to be metadata, comment, or source reference updates only.",
                "",
            ]
        )

    lines.extend(report_bullets("### Metadata-Only File Changes", metadata_only_reports))
    lines.extend(report_bullets("### Removed Translation Files", removed_reports))

    if parse_errors:
        lines.extend(["### Parse Errors", ""])
        for error in parse_errors:
            lines.append(f"- `{error['path']}`: {html.escape(error['error'])}")
        lines.append("")

    lines.append("</details>")
    lines.append("")

    return "\n".join(lines)
|
|
|
|
|
|
def build_language_report(
    repo: str,
    change: dict[str, Any],
    head_sha: str,
) -> tuple[dict[str, Any] | None, dict[str, str] | None]:
    """Diff one changed `.po` file between the local base checkout and the PR head.

    The base side is read from the trusted local checkout, while the head side
    is fetched by SHA from GitHub so the workflow does not have to execute PR
    code.

    Returns:
        `(report, None)` on success, where `report` has `language`, `path`,
        `status` and `changes`; `(None, error)` with `path` and `error` when
        any step raises.
    """

    def po_only(path: str | None) -> str | None:
        # Only `.po` paths are loaded; a rename to/from another extension
        # leaves that side without content.
        return path if (path or "").endswith(".po") else None

    base_path = base_path_for_file(change)
    head_path = head_path_for_file(change)
    base_po_path = po_only(base_path)
    head_po_path = po_only(head_path)
    display_path = head_path or base_path or change.get("filename")

    try:
        base_content = read_local_file(base_po_path)
        head_content = fetch_file_content(repo, head_po_path, head_sha)

        base_language, base_entries = load_translation_entries(base_content)
        head_language, head_entries = load_translation_entries(head_content)

        report = {
            # Prefer the head catalog's locale, then the base one, then the file stem.
            "language": head_language or base_language or Path(display_path).stem,
            "path": display_path,
            "status": change.get("status"),
            "changes": compare_entries(base_entries, head_entries),
        }
    except Exception as exc:
        # Best effort: one unreadable file becomes an error record instead of
        # aborting the whole comment.
        return None, {"path": display_path, "error": str(exc)}

    return report, None
|
|
|
|
|
|
def main() -> None:
    """Build the review comment for the configured PR and write it to the output file."""
    args = parse_args()
    if not (args.repo and args.pr and args.head_sha):
        raise SystemExit("Missing required pull request context.")

    po_files = [change for change in fetch_pr_files(args.repo, args.pr) if is_po_file(change)]

    language_reports: list[dict[str, Any]] = []
    parse_errors: list[dict[str, str]] = []
    for change in po_files:
        report, error = build_language_report(args.repo, change, args.head_sha)
        if report:
            language_reports.append(report)
        if error:
            parse_errors.append(error)

    # Order the sections by language, then path, ignoring case.
    language_reports.sort(key=lambda report: (str(report["language"]).lower(), str(report["path"]).lower()))

    similar_groups = cluster_similar_change_sizes(po_files)
    comment = build_comment(po_files, language_reports, similar_groups, parse_errors)
    Path(args.output).write_text(comment, encoding="utf-8")
|
|
|
|
|
|
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
|