Merge pull request #25659 from marination/wkhtml-options

fix: Use CssParser to correctly pass options to wkhtmltopdf
2024-03-27 17:15:36 +05:30 · 2024-03-27 17:15:36 +05:30 · 722bb3b0fc
commit 722bb3b0fc
parent 59b95a4d19 96667b1bab
3 changed files with 69 additions and 13 deletions
--- a/frappe/tests/test_pdf.py
+++ b/frappe/tests/test_pdf.py
@ -37,9 +37,32 @@ class TestPdf(FrappeTestCase):
 	def test_read_options_from_html(self):
 		_, html_options = pdfgen.read_options_from_html(self.html)
 		self.assertTrue(html_options["margin-top"] == "0")
-		self.assertTrue(html_options["margin-left"] == "10")
+		self.assertTrue(html_options["margin-left"] == "10mm")
 		self.assertTrue(html_options["margin-right"] == "0")

+		html_1 = """<style>
+			.print-format {
+				margin-top: 0mm;
+				margin-left: 10mm;
+			}
+			.print-format .more-info {
+				margin-right: 15mm;
+			}
+			.print-format, .more-info {
+				margin-bottom: 20mm;
+			}
+			</style>
+			<div class="more-info">Hello</div>
+		"""
+		_, options = pdfgen.read_options_from_html(html_1)
+
+		self.assertTrue(options["margin-top"] == "0")
+		self.assertTrue(options["margin-left"] == "10mm")
+		self.assertTrue(options["margin-bottom"] == "20mm")
+		# margin-right was for .more-info (child of .print-format)
+		# so it should not be extracted into options
+		self.assertFalse(options.get("margin-right"))
+
 	def test_pdf_encryption(self):
 		password = "qwe"
 		pdf = pdfgen.get_pdf(self.html, options={"password": password})
--- a/frappe/utils/pdf.py
+++ b/frappe/utils/pdf.py
@ -5,10 +5,10 @@ import contextlib
 import io
 import mimetypes
 import os
-import re
 import subprocess
 from urllib.parse import parse_qs, urlparse

+import cssutils
 import pdfkit
 from bs4 import BeautifulSoup
 from packaging.version import Version
@ -206,8 +206,9 @@ def read_options_from_html(html):

 	toggle_visible_pdf(soup)

-	# use regex instead of soup-parser
-	for attr in (
+	valid_styles = get_print_format_styles(soup)
+
+	attrs = (
 		"margin-top",
 		"margin-bottom",
 		"margin-left",
@ -217,18 +218,49 @@ def read_options_from_html(html):
 		"orientation",
 		"page-width",
 		"page-height",
-	):
-		try:
-			pattern = re.compile(r"(\.print-format)([\S|\s][^}]*?)(" + str(attr) + r":)(.+)(mm;)")
-			match = pattern.findall(html)
-			if match:
-				options[attr] = str(match[-1][3]).strip()
-		except Exception:
-			pass
-
+	)
+	options |= {style.name: style.value for style in valid_styles if style.name in attrs}
 	return str(soup), options


+def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]:
+	"""
+	Get styles purely on class 'print-format'.
+	Valid:
+	1) .print-format { ... }
+	2) .print-format, p { ... } | p, .print-format { ... }

+	Invalid (applied on child elements):
+	1) .print-format p { ... } | .print-format > p { ... }
+	2) .print-format #abc { ... }

+	Returns:
+	[cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...]
+	"""
+	# Prepare a css stylesheet from all the style tags' contents.
+	# `.string` is None for an empty <style></style> tag (or one with several
+	# child nodes), so fall back to "" instead of raising TypeError on concat.
+	stylesheet = "".join(style_tag.string or "" for style_tag in soup.find_all("style"))

+	# Use css parser to tokenize the classes and their styles
+	parsed_sheet = cssutils.parseString(stylesheet)

+	# Get all styles that are only for .print-format
+	valid_styles = []
+	for rule in parsed_sheet:
+		# Skip non-style rules; only CSSStyleRule is inspected below
+		if not isinstance(rule, cssutils.css.CSSStyleRule):
+			continue

+		# Allow only .print-format { ... } and .print-format, p { ... }
+		# Disallow .print-format p { ... } and .print-format > p { ... }
+		selectors = [selector.strip() for selector in rule.selectorText.split(",")]
+		if ".print-format" in selectors:
+			valid_styles.extend(rule.style)

+	return valid_styles
+
+
 def inline_private_images(html) -> str:
 	soup = BeautifulSoup(html, "html.parser")
 	for img in soup.find_all("img"):
--- a/pyproject.toml
+++ b/pyproject.toml
@ -34,6 +34,7 @@ dependencies = [
    "chardet~=5.1.0",
    "croniter~=2.0.1",
    "cryptography~=42.0.0",
+    "cssutils~=2.9.0",
    "email-reply-parser~=0.5.12",
    "gunicorn~=21.2.0",
    "html5lib~=1.1",