diff --git a/frappe/tests/test_pdf.py b/frappe/tests/test_pdf.py index fb68cdffe4..4ae8e8ff3c 100644 --- a/frappe/tests/test_pdf.py +++ b/frappe/tests/test_pdf.py @@ -37,9 +37,32 @@ class TestPdf(FrappeTestCase): def test_read_options_from_html(self): _, html_options = pdfgen.read_options_from_html(self.html) self.assertTrue(html_options["margin-top"] == "0") - self.assertTrue(html_options["margin-left"] == "10") + self.assertTrue(html_options["margin-left"] == "10mm") self.assertTrue(html_options["margin-right"] == "0") + html_1 = """ +
Hello
def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]:
    """
    Collect the CSS declarations that target the ``.print-format`` class itself.

    The contents of every ``<style>`` tag in ``soup`` are concatenated and
    parsed with ``cssutils``. A top-level style rule contributes its
    declarations only when one of its comma-separated selectors is exactly
    ``.print-format``.

    Valid:
    1) .print-format { ... }
    2) .print-format, p { ... } | p, .print-format { ... }

    Invalid (applied on child elements):
    1) .print-format p { ... } | .print-format > p { ... }
    2) .print-format #abc { ... }

    Only top-level style rules are inspected; any other kind of rule yielded
    by the parsed sheet is skipped.

    Args:
        soup: Parsed HTML document whose ``<style>`` tags are scanned.

    Returns:
        The matching properties in stylesheet order, e.g.
        [cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...]
    """
    # ``Tag.string`` is None for an empty <style> tag or one holding more than
    # one child node; treat those as empty text instead of raising TypeError
    # on ``str + None``.
    stylesheet = "".join(style_tag.string or "" for style_tag in soup.find_all("style"))

    # Use the css parser to tokenize the selectors and their declarations
    parsed_sheet = cssutils.parseString(stylesheet)

    valid_styles = []
    for rule in parsed_sheet:
        if not isinstance(rule, cssutils.css.CSSStyleRule):
            continue

        # Compare against the selectors cssutils already parsed rather than
        # splitting the raw selector text on ",", which would mis-split
        # selectors carrying commas inside parentheses or attribute values.
        # Allow only .print-format { ... } and .print-format, p { ... }
        # Disallow .print-format p { ... } and .print-format > p { ... }
        if any(selector.selectorText.strip() == ".print-format" for selector in rule.selectorList):
            valid_styles.extend(rule.style)

    return valid_styles