From 5dbcbbb915bdbc8e3b895c71333d4ec683073067 Mon Sep 17 00:00:00 2001 From: marination Date: Tue, 26 Mar 2024 10:33:35 +0100 Subject: [PATCH] fix: Use CssParser to correctly pass options to wkhtmltopdf - Regex incorrectly fetches .print-format's child styles and also extracts the wrong attribute value - A CssParser is more maintainable and more readable as well as less prone to errors while extracting values - Method: We extract style tag contents out of the html and tokenize them. We then filter the styles for the right selector and extract the attributes we want from them. - This way we make sure that the right value is extracted and only the ones applicable to .print-format directly --- frappe/utils/pdf.py | 53 +++++++++++++++++++++++++++++++++++++-------- pyproject.toml | 1 + 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/frappe/utils/pdf.py b/frappe/utils/pdf.py index bed32987d1..7844f1612a 100644 --- a/frappe/utils/pdf.py +++ b/frappe/utils/pdf.py @@ -5,10 +5,10 @@ import contextlib import io import mimetypes import os -import re import subprocess from urllib.parse import parse_qs, urlparse +import cssutils import pdfkit from bs4 import BeautifulSoup from packaging.version import Version @@ -206,7 +206,8 @@ def read_options_from_html(html): toggle_visible_pdf(soup) - # use regex instead of soup-parser + valid_styles = get_print_format_styles(soup) + for attr in ( "margin-top", "margin-bottom", @@ -218,17 +219,51 @@ def read_options_from_html(html): "page-width", "page-height", ): - try: - pattern = re.compile(r"(\.print-format)([\S|\s][^}]*?)(" + str(attr) + r":)(.+)(mm;)") - match = pattern.findall(html) - if match: - options[attr] = str(match[-1][3]).strip() - except Exception: - pass + for style in valid_styles: + if attr == style.name: + options[attr] = style.value return str(soup), options +def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]: + """ + Get styles purely on class 'print-format'. + Valid: + 1) .print-format { ... } + 2) .print-format, p { ... } | p, .print-format { ... } + + Invalid (applied on child elements): + 1) .print-format p { ... } | .print-format > p { ... } + 2) .print-format #abc { ... } + + Returns: + [cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...] + """ + stylesheet = "" + style_tags = soup.find_all("style") + + # Prepare a css stylesheet from all the style tags' contents + for style_tag in style_tags: + stylesheet += style_tag.string + + # Use css parser to tokenize the classes and their styles + parsed_sheet = cssutils.parseString(stylesheet) + + # Get all styles that are only for .print-format + valid_styles = [] + for rule in parsed_sheet: + if not isinstance(rule, cssutils.css.CSSStyleRule): + continue + + # Allow only .print-format { ... } and .print-format, p { ... } + # Disallow .print-format p { ... } and .print-format > p { ... } + if ".print-format" in [x.strip() for x in rule.selectorText.split(",")]: + valid_styles.extend(entry for entry in rule.style) + + return valid_styles + + def inline_private_images(html) -> str: soup = BeautifulSoup(html, "html.parser") for img in soup.find_all("img"): diff --git a/pyproject.toml b/pyproject.toml index 2e2a7948e6..14289d1a72 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,6 +34,7 @@ dependencies = [ "chardet~=5.1.0", "croniter~=2.0.1", "cryptography~=42.0.0", + "cssutils~=2.9.0", "email-reply-parser~=0.5.12", "gunicorn~=21.2.0", "html5lib~=1.1",