# Copyright (c) 2015, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import base64
import contextlib
import io
import mimetypes
import os
import subprocess
from urllib.parse import parse_qs, urlparse

import cssutils
import pdfkit

pdfkit.source.unicode = str  # NOTE: upstream bug; PYTHONOPTIMIZE=1 optimized this away
from bs4 import BeautifulSoup
from packaging.version import Version
from pypdf import PdfReader, PdfWriter, errors

import frappe
from frappe import _
from frappe.core.doctype.file.utils import find_file_by_url
from frappe.utils import cstr, scrub_urls
from frappe.utils.caching import redis_cache
from frappe.utils.data import get_url
from frappe.utils.jinja_globals import bundled_asset, is_rtl

cssutils.log.setLog(frappe.logger("cssutils"))

# Substrings looked for in the text of an OSError raised while generating a PDF;
# a match is reported to the user as a broken-image-link failure (see get_pdf).
PDF_CONTENT_ERRORS = [
    "ContentNotFoundError",
    "ContentOperationNotPermittedError",
    "UnknownContentError",
    "RemoteHostClosedError",
]


def pdf_header_html(soup, head, content, styles, html_id, css, path=None):
    """Render the header/footer template and return the resulting HTML.

    ``soup`` is accepted for signature compatibility but is not used here.
    When ``path`` is falsy the default
    ``templates/print_formats/pdf_header_footer.html`` template is rendered.
    """
    if not path:
        path = "templates/print_formats/pdf_header_footer.html"

    return frappe.render_template(
        path,
        {
            "head": head,
            "content": content,
            "styles": styles,
            "html_id": html_id,
            "css": css,
            "lang": frappe.local.lang,
            "layout_direction": "rtl" if is_rtl() else "ltr",
        },
    )


def pdf_body_html(template, args, **kwargs):
    """Render ``template`` with ``args`` and return the result.

    Any exception raised while rendering is reported through ``frappe.throw``
    as a ``frappe.PrintFormatError`` whose message includes a best-effort guess
    of the failing template line. ``kwargs`` are accepted but ignored.
    """
    try:
        return template.render(args, filters={"len": len})
    except Exception as e:
        # Guess line number ?
        frappe.throw(
            _("Error in print format on line {0}: {1}").format(
                _guess_template_error_line_number(template), e
            ),
            exc=frappe.PrintFormatError,
            title=_("Print Format Error"),
        )


def _guess_template_error_line_number(template) -> int | None:
    """Guess line on which exception occurred from current traceback."""
    # Best effort only: any failure while inspecting the traceback yields None.
    with contextlib.suppress(Exception):
        import sys
        import traceback

        _, _, tb = sys.exc_info()

        # Walk from the innermost frame outwards and stop at the first frame
        # whose filename contains the template's filename.
        for frame in reversed(traceback.extract_tb(tb)):
            if template.filename in frame.filename:
                return frame.lineno


def pdf_footer_html(soup, head, content, styles, html_id, css, path=None):
    """Render the footer; delegates to ``pdf_header_html`` with the same arguments."""
    return pdf_header_html(
        soup=soup, head=head, content=content, styles=styles, html_id=html_id, css=css, path=path
    )


def get_pdf(html, options=None, output: PdfWriter | None = None):
    """Convert ``html`` to a PDF using wkhtmltopdf (through pdfkit).

    Args:
        html: HTML markup to render.
        options: optional dict of wkhtmltopdf options; it is extended by
            ``prepare_options`` and by this function. A ``"password"`` key, if
            present, is used to encrypt the returned PDF bytes.
        output: optional ``PdfWriter``; when truthy the generated pages are
            appended to it and it is returned instead of bytes.

    Returns:
        ``output`` when it was given and truthy, otherwise the PDF file bytes.

    Raises:
        OSError: re-raised when generation or parsing fails for a reason that
            is not handled as a broken-image-link failure.
        Whatever ``frappe.throw`` raises when generation produced no data and
        the error text matches one of ``PDF_CONTENT_ERRORS``.
    """
    html = scrub_urls(html)
    html, options = prepare_options(html, options)

    options.update({"disable-javascript": "", "disable-local-file-access": ""})

    filedata = ""
    if Version(get_wkhtmltopdf_version()) > Version("0.12.3"):
        options.update({"disable-smart-shrinking": ""})

    try:
        # Set filename property to false, so no file is actually created
        filedata = pdfkit.from_string(html, options=options or {}, verbose=True)

        # create in-memory binary streams from filedata and create a PdfReader object
        reader = PdfReader(io.BytesIO(filedata))
    except OSError as e:
        # If data was produced, the failure came from parsing it, so there is no
        # reader to continue with; surface the original error. (Previously this
        # path referenced an unbound `reader` and double-appended to `output`.)
        if filedata or not any(error in str(e) for error in PDF_CONTENT_ERRORS):
            raise

        print(html, options)
        frappe.throw(_("PDF generation failed because of broken image links"))
    finally:
        # Always remove the temporary header/footer/cookie-jar files.
        cleanup(options)

    if output:
        output.append_pages_from_reader(reader)
        return output

    writer = PdfWriter()
    writer.append_pages_from_reader(reader)

    if "password" in options:
        writer.encrypt(options["password"])

    return get_file_data_from_writer(writer)


def measure_time(func):
    """Decorator that prints how long each call to ``func`` took."""
    import functools
    import time

    # functools.wraps keeps the wrapped function's __name__/__doc__ intact.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, unlike wall-clock time.time().
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds")
        return result

    return wrapper


@measure_time
def get_chrome_pdf(print_format, html, options, output, pdf_generator=None):
    """Generate a PDF through the Chrome based generator.

    Returns ``None`` without doing any work unless ``pdf_generator`` is
    ``"chrome"``; otherwise returns the result of
    ``PDFTransformer.transform_pdf(output=output)``.
    """
    from frappe.utils.pdf_generator.browser import Browser
    from frappe.utils.pdf_generator.chrome_pdf_generator import ChromePDFGenerator
    from frappe.utils.pdf_generator.pdf_merge import PDFTransformer

    if pdf_generator != "chrome":
        # Use the default pdf generator
        return

    # scrubbing url to expand url is not required as we have set url.
    # also, planning to remove network requests anyway 🤞
    generator = ChromePDFGenerator()
    browser = Browser(generator, print_format, html, options)
    transformer = PDFTransformer(browser)
    # transforms and merges header, footer into body pdf and returns merged pdf
    return transformer.transform_pdf(output=output)


def get_file_data_from_writer(writer_obj):
    """Write ``writer_obj`` into an in-memory stream and return its bytes."""
    # https://docs.python.org/3/library/io.html
    stream = io.BytesIO()
    writer_obj.write(stream)

    # getvalue() returns the whole buffer regardless of the stream position.
    return stream.getvalue()


def prepare_options(html, options):
    """Build the wkhtmltopdf options for ``html``.

    Starts from ``options`` (updated in place when provided), applies the
    defaults below, then overlays options read from the HTML itself, the cookie
    jar option and the page size. Private images in the HTML are inlined.

    Returns:
        tuple: ``(html, options)`` with the processed markup and options dict.
    """
    if not options:
        options = {}

    options.update(
        {
            "print-media-type": None,
            "background": None,
            "images": None,
            "quiet": None,
            # 'no-outline': None,
            "encoding": "UTF-8",
            # 'load-error-handling': 'ignore'
        }
    )

    if not options.get("margin-right"):
        options["margin-right"] = "15mm"

    if not options.get("margin-left"):
        options["margin-left"] = "15mm"

    html, html_options = read_options_from_html(html)
    options.update(html_options or {})

    # cookies
    options.update(get_cookie_options())

    html = inline_private_images(html)

    # page size
    pdf_page_size = (
        options.get("page-size")
        or frappe.db.get_single_value("Print Settings", "pdf_page_size")
        or "A4"
    )

    if pdf_page_size == "Custom":
        options["page-height"] = options.get("page-height") or frappe.db.get_single_value(
            "Print Settings", "pdf_page_height"
        )
        options["page-width"] = options.get("page-width") or frappe.db.get_single_value(
            "Print Settings", "pdf_page_width"
        )
    else:
        options["page-size"] = pdf_page_size

    return html, options


def get_cookie_options():
    """Return a ``{"cookie-jar": path}`` option carrying the current session id.

    The jar file is only written when there is a session with a ``sid`` and
    ``frappe.local`` has a ``request``; otherwise an empty dict is returned.
    """
    options = {}
    if frappe.session and frappe.session.sid and hasattr(frappe.local, "request"):
        # Use wkhtmltopdf's cookie-jar feature to set cookies and restrict them to host domain
        cookiejar = f"/tmp/{frappe.generate_hash()}.jar"

        # Remove port from request.host
        # https://werkzeug.palletsprojects.com/en/0.16.x/wrappers/#werkzeug.wrappers.BaseRequest.host
        domain = frappe.utils.get_host_name().split(":", 1)[0]

        # The jar holds the session id, so create it readable/writable by the
        # owner only instead of relying on the process umask.
        fd = os.open(cookiejar, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w") as f:
            f.write(f"sid={frappe.session.sid}; Domain={domain};\n")

        options["cookie-jar"] = cookiejar

    return options


def read_options_from_html(html):
    """Extract wkhtmltopdf options embedded in ``html``.

    Header/footer elements are pulled out into temp files (see
    ``prepare_header_footer``), pdf visibility classes are applied, and the
    page-level properties listed in ``attrs`` are read from ``.print-format``
    style rules.

    Returns:
        tuple: ``(html, options)`` where ``html`` is the re-serialized markup.
    """
    options = {}
    soup = BeautifulSoup(html, "html5lib")

    options.update(prepare_header_footer(soup))

    toggle_visible_pdf(soup)

    valid_styles = get_print_format_styles(soup)

    attrs = (
        "margin-top",
        "margin-bottom",
        "margin-left",
        "margin-right",
        "page-size",
        "header-spacing",
        "orientation",
        "page-width",
        "page-height",
    )
    options |= {style.name: style.value for style in valid_styles if style.name in attrs}
    return str(soup), options


def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]:
    """
    Get styles purely on class 'print-format'.
    Valid:
    1) .print-format { ... }
    2) .print-format, p { ... } | p, .print-format { ... }

    Invalid (applied on child elements):
    1) .print-format p { ... } | .print-format > p { ... }
    2) .print-format #abc { ... }

    Returns:
    [cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...]
    """
    # Prepare a css stylesheet from all the style tags' contents
    stylesheet = "".join(cstr(style_tag.string) for style_tag in soup.find_all("style"))

    # Use css parser to tokenize the classes and their styles
    parsed_sheet = cssutils.parseString(stylesheet)

    # Get all styles that are only for .print-format
    valid_styles = []
    for rule in parsed_sheet:
        if not isinstance(rule, cssutils.css.CSSStyleRule):
            continue

        # Allow only .print-format { ... } and .print-format, p { ... }
        # Disallow .print-format p { ... } and .print-format > p { ... }
        if ".print-format" in [x.strip() for x in rule.selectorText.split(",")]:
            valid_styles.extend(rule.style)

    return valid_styles


def inline_private_images(html) -> str:
    """Replace the ``src`` of private images with inline base64 data URIs.

    ``<img>`` tags without a ``src`` attribute (or with an empty one) are left
    untouched instead of raising ``KeyError``.
    """
    soup = BeautifulSoup(html, "html.parser")
    for img in soup.find_all("img"):
        src = img.get("src")
        if not src:
            continue

        if b64 := _get_base64_image(src):
            img["src"] = b64

    return str(soup)


def _get_base64_image(src):
    """Return base64 version of image if user has permission to view it"""
    # Returns None when the URL is not an image, the file is missing or not
    # private, or when anything fails (failures are logged, not raised).
    try:
        parsed_url = urlparse(src)
        path = parsed_url.path
        query = parse_qs(parsed_url.query)
        mime_type = mimetypes.guess_type(path)[0]
        if mime_type is None or not mime_type.startswith("image/"):
            return

        # First value of the "fid" query parameter, if any.
        filename = (query.get("fid") and query["fid"][0]) or None
        file = find_file_by_url(path, name=filename)
        if not file or not file.is_private:
            return

        b64_encoded_image = base64.b64encode(file.get_content()).decode()
        return f"data:{mime_type};base64,{b64_encoded_image}"
    except Exception:
        frappe.logger("pdf").error("Failed to convert inline images to base64", exc_info=True)


def prepare_header_footer(soup: BeautifulSoup):
    """Extract ``#header-html`` / ``#footer-html`` from ``soup`` into temp files.

    Each element found is removed from ``soup``, rendered through the last
    registered ``pdf_header_html`` / ``pdf_footer_html`` hook and written to a
    file under ``/tmp``; the file path is returned under the matching option
    key. When an element is missing, a default 15mm top/bottom margin is set
    instead.
    """
    options = {}

    head = soup.find("head").contents
    styles = soup.find_all("style")

    print_css = bundled_asset("print.bundle.css").lstrip("/")
    css = frappe.read_file(os.path.join(frappe.local.sites_path, print_css))

    # extract header and footer
    for html_id in ("header-html", "footer-html"):
        if content := soup.find(id=html_id):
            content = content.extract()
            # `header/footer-html` are extracted, rendered as html
            # and passed in wkhtmltopdf options (as '--header/footer-html')
            # Remove instances of them from main content for render_template
            for tag in soup.find_all(id=html_id):
                tag.extract()

            toggle_visible_pdf(content)
            id_map = {"header-html": "pdf_header_html", "footer-html": "pdf_footer_html"}
            hook_func = frappe.get_hooks(id_map.get(html_id))
            html = frappe.call(
                hook_func[-1],
                soup=soup,
                head=head,
                content=content,
                styles=styles,
                html_id=html_id,
                css=css,
            )

            # create temp file
            fname = os.path.join("/tmp", f"frappe-pdf-{frappe.generate_hash()}.html")
            with open(fname, "wb") as f:
                f.write(html.encode("utf-8"))

            # {"header-html": "/tmp/frappe-pdf-random.html"}
            options[html_id] = fname
        else:
            if html_id == "header-html":
                options["margin-top"] = "15mm"
            elif html_id == "footer-html":
                options["margin-bottom"] = "15mm"

    return options


def cleanup(options):
    """Delete the temporary files referenced by ``options``, if they exist."""
    for key in ("header-html", "footer-html", "cookie-jar"):
        path = options.get(key)
        if not path:
            continue

        # EAFP: a file that is already gone is not an error.
        with contextlib.suppress(FileNotFoundError):
            os.remove(path)


def toggle_visible_pdf(soup):
    """Unhide ``visible-pdf`` elements and drop ``hidden-pdf`` elements in place."""
    for tag in soup.find_all(attrs={"class": "visible-pdf"}):
        # remove visible-pdf class to unhide
        tag.attrs["class"].remove("visible-pdf")

    for tag in soup.find_all(attrs={"class": "hidden-pdf"}):
        # remove tag from html
        tag.extract()


@frappe.whitelist()
@redis_cache(ttl=60 * 60)
def is_wkhtmltopdf_valid():
    """Return True if ``wkhtmltopdf --version`` runs and its output mentions "qt"."""
    try:
        output = subprocess.check_output(["wkhtmltopdf", "--version"])
        return "qt" in output.decode("utf-8").lower()
    except Exception:
        return False


def get_wkhtmltopdf_version():
    """Return the wkhtmltopdf version string, or ``"0"`` if it cannot be determined.

    The value is read from / stored in the ``wkhtmltopdf_version`` cache hash.
    """
    wkhtmltopdf_version = frappe.cache.hget("wkhtmltopdf_version", None)

    if not wkhtmltopdf_version:
        try:
            res = subprocess.check_output(["wkhtmltopdf", "--version"])
            # Second whitespace-separated token; split() (rather than
            # split(" ")) keeps a trailing newline out of the cached value.
            wkhtmltopdf_version = res.decode("utf-8").split()[1]
            frappe.cache.hset("wkhtmltopdf_version", None, wkhtmltopdf_version)
        except Exception:
            # Deliberate best effort: fall back to "0" below.
            pass

    return wkhtmltopdf_version or "0"


def pdf_contains_js(file_content: bytes):
    """
    Check if a PDF file contains JavaScript.

    Args:
        file_content (bytes): The content of the PDF file.

    Returns:
        bool: True if the PDF contains JavaScript, False otherwise and also if the file is encrypted.
    """
    reader = PdfReader(io.BytesIO(file_content))

    def has_javascript(obj):
        """Recursively look for a "/JS" or "/JavaScript" key in dicts/lists."""
        if isinstance(obj, dict):
            for key, value in obj.items():
                if key in ("/JS", "/JavaScript"):
                    return True
                if has_javascript(value):
                    return True
        elif isinstance(obj, list):
            for item in obj:
                if has_javascript(item):
                    return True
        return False

    root = reader.trailer.get("/Root", {})
    if has_javascript(root):
        return True

    try:
        for page in reader.pages:
            if has_javascript(page):
                return True
    except errors.FileNotDecryptedError:
        # Pages of an undecrypted file cannot be inspected; treated as "no JS".
        pass

    return False


def get_host_url():
    """Return the current request's ``host_url``, or ``get_url() + "/"`` outside a request."""
    if frappe.request:
        return frappe.request.host_url
    else:
        return get_url() + "/"