seitime-frappe/frappe/utils/pdf.py

# Copyright (c) 2015, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import base64
import contextlib
import io
import mimetypes
import os
import subprocess
from urllib.parse import parse_qs, urlparse

import cssutils
import pdfkit

pdfkit.source.unicode = str  # NOTE: upstream bug; PYTHONOPTIMIZE=1 optimized this away
from bs4 import BeautifulSoup
from packaging.version import Version
from pypdf import PdfReader, PdfWriter, errors

import frappe
from frappe import _
from frappe.core.doctype.file.utils import find_file_by_url
from frappe.utils import cstr, scrub_urls
from frappe.utils.caching import redis_cache
from frappe.utils.jinja_globals import bundled_asset, is_rtl

# Send cssutils' log output to the frappe logger named "cssutils".
cssutils.log.setLog(frappe.logger("cssutils"))

# Substrings searched for in the message of an OSError raised while generating a
# PDF (see get_pdf). A match is reported to the user as broken image links;
# any other OSError is re-raised unchanged.
PDF_CONTENT_ERRORS = [
	"ContentNotFoundError",
	"ContentOperationNotPermittedError",
	"UnknownContentError",
	"RemoteHostClosedError",
]


def pdf_header_html(soup, head, content, styles, html_id, css, path=None):
	"""Render the standalone HTML page used for a PDF header or footer.

	Args:
		soup: Not used by this function.
		head: Exposed to the template as ``head``.
		content: Exposed to the template as ``content``.
		styles: Exposed to the template as ``styles``.
		html_id: Exposed to the template as ``html_id``.
		css: Exposed to the template as ``css``.
		path: Template to render. Falls back to
			``templates/print_formats/pdf_header_footer.html`` when empty.

	Returns:
		Whatever ``frappe.render_template`` returns for the chosen template.
	"""
	template_path = path or "templates/print_formats/pdf_header_footer.html"
	context = dict(
		head=head,
		content=content,
		styles=styles,
		html_id=html_id,
		css=css,
		lang=frappe.local.lang,
		layout_direction="rtl" if is_rtl() else "ltr",
	)
	return frappe.render_template(template_path, context)


def pdf_body_html(template, args, **kwargs):
	"""Render ``template`` with ``args`` and return the result.

	The template is rendered with an extra ``len`` filter. Any exception raised
	while rendering is reported through ``frappe.throw`` as a
	``frappe.PrintFormatError`` whose message includes the guessed template line.

	Args:
		template: Object exposing ``render(args, filters=...)``.
		args: Context handed to ``template.render``.
		**kwargs: Accepted but not used.
	"""
	try:
		rendered = template.render(args, filters={"len": len})
	except Exception as e:
		message_template = _("Error in print format on line {0}: {1}")
		# Best-effort guess; may be None when the line cannot be determined
		line_number = _guess_template_error_line_number(template)
		frappe.throw(
			message_template.format(line_number, e),
			exc=frappe.PrintFormatError,
			title=_("Print Format Error"),
		)
	else:
		return rendered


def _guess_template_error_line_number(template) -> int | None:
	"""Guess line on which exception occurred from current traceback."""
	with contextlib.suppress(Exception):
		import sys
		import traceback

		_, _, tb = sys.exc_info()

		for frame in reversed(traceback.extract_tb(tb)):
			if template.filename in frame.filename:
				return frame.lineno


def pdf_footer_html(soup, head, content, styles, html_id, css, path=None):
	"""Render the PDF footer page by delegating to ``pdf_header_html``.

	All arguments are forwarded unchanged and the result is returned as is.
	"""
	return pdf_header_html(soup, head, content, styles, html_id, css, path)


def get_pdf(html, options=None, output: PdfWriter | None = None):
	"""Convert ``html`` to a PDF using wkhtmltopdf (through pdfkit).

	Args:
		html: HTML of the document to convert.
		options: Optional dict of wkhtmltopdf options; it is completed by
			``prepare_options``. If it contains a ``"password"`` key and no
			``output`` is given, the resulting PDF is encrypted with that value.
		output: Optional ``PdfWriter``. When given, the generated pages are
			appended to it and the writer itself is returned.

	Returns:
		``output`` when it was supplied, otherwise the PDF content returned by
		``get_file_data_from_writer``.

	Raises:
		OSError: When PDF generation fails with a message that matches none of
			``PDF_CONTENT_ERRORS``. Matching failures are reported through
			``frappe.throw`` as broken image links.
	"""
	html = scrub_urls(html)
	html, options = prepare_options(html, options)

	# prepare_options may have created temporary files (header/footer pages,
	# cookie jar); everything from here on runs under `finally` so they are
	# removed even if a later step fails.
	try:
		options.update({"disable-javascript": "", "disable-local-file-access": ""})
		if Version(get_wkhtmltopdf_version()) > Version("0.12.3"):
			options.update({"disable-smart-shrinking": ""})

		try:
			# No output path is passed, so the PDF is returned instead of written to disk
			filedata = pdfkit.from_string(html, options=options or {}, verbose=True)
		except OSError as e:
			# pdfkit raises without returning any data, so there is no partial
			# PDF that could be salvaged here.
			if any(error in str(e) for error in PDF_CONTENT_ERRORS):
				frappe.logger("pdf").error("wkhtmltopdf could not load some of the page content", exc_info=True)
				frappe.throw(_("PDF generation failed because of broken image links"))
			raise

		# create in-memory binary stream from filedata and create a PdfReader object
		reader = PdfReader(io.BytesIO(filedata))
	finally:
		cleanup(options)

	if output:
		output.append_pages_from_reader(reader)
		return output

	writer = PdfWriter()
	writer.append_pages_from_reader(reader)

	if "password" in options:
		writer.encrypt(options["password"])

	return get_file_data_from_writer(writer)


def measure_time(func):
	"""Decorator that prints how long each call to ``func`` took.

	The wrapper forwards all positional and keyword arguments, returns the
	wrapped function's result unchanged and keeps its metadata (name,
	docstring, signature) via ``functools.wraps``.

	Args:
		func: Callable to time.

	Returns:
		The wrapping callable.
	"""
	import functools
	import time

	@functools.wraps(func)
	def wrapper(*args, **kwargs):
		# perf_counter is monotonic, so the measurement is not affected by
		# system clock adjustments
		start_time = time.perf_counter()
		result = func(*args, **kwargs)
		elapsed = time.perf_counter() - start_time
		print(f"Function {func.__name__} took {elapsed:.4f} seconds")
		return result

	return wrapper


@measure_time
def get_chrome_pdf(print_format, html, options, output, pdf_generator=None):
	"""Generate a PDF with the Chrome based generator.

	Args:
		print_format: Passed through to ``Browser``.
		html: HTML to render; passed through to ``Browser``.
		options: Passed through to ``Browser``.
		output: Passed to ``PDFTransformer.transform_pdf`` as ``output``.
		pdf_generator: Name of the requested generator.

	Returns:
		The result of ``PDFTransformer.transform_pdf``, or ``None`` when
		``pdf_generator`` is not ``"chrome"``.
	"""
	if pdf_generator != "chrome":
		# Use the default pdf generator
		return None

	# Imported only once the Chrome generator is actually requested, so the
	# default path neither pays for nor depends on these modules.
	from frappe.utils.pdf_generator.browser import Browser
	from frappe.utils.pdf_generator.chrome_pdf_generator import ChromePDFGenerator
	from frappe.utils.pdf_generator.pdf_merge import PDFTransformer

	# scrubbing url to expand url is not required as we have set url.
	# also, planning to remove network requests anyway
	generator = ChromePDFGenerator()
	browser = Browser(generator, print_format, html, options)
	transformer = PDFTransformer(browser)
	# transforms and merges header, footer into body pdf and returns merged pdf
	return transformer.transform_pdf(output=output)


def get_file_data_from_writer(writer_obj):
	"""Return everything ``writer_obj.write`` emits, as bytes.

	Args:
		writer_obj: Object with a ``write(stream)`` method that writes to a
			binary stream.

	Returns:
		The complete content written to the in-memory stream.
	"""
	with io.BytesIO() as buffer:
		writer_obj.write(buffer)
		# getvalue() returns the whole buffer regardless of the stream position
		return buffer.getvalue()


def prepare_options(html, options):
	"""Complete the wkhtmltopdf options for ``html`` and pre-process the markup.

	Starting from ``options`` (a new dict when it is empty), this sets the
	fixed flags, defaults the left/right margins, merges the options read from
	the HTML itself, adds the cookie options, inlines private images and
	finally settles the page size.

	Args:
		html: HTML of the document.
		options: Dict of options or a falsy value. A non-empty dict is updated
			in place.

	Returns:
		Tuple ``(html, options)`` with the processed markup and the options.
	"""
	options = options or {}

	options.update(
		{
			"print-media-type": None,
			"background": None,
			"images": None,
			"quiet": None,
			# 'no-outline': None,
			"encoding": "UTF-8",
			# 'load-error-handling': 'ignore'
		}
	)

	# Default horizontal margins when none (or an empty one) was supplied
	for side in ("margin-right", "margin-left"):
		if not options.get(side):
			options[side] = "15mm"

	# Options declared in the document override what was set so far
	html, html_options = read_options_from_html(html)
	options.update(html_options or {})

	# cookies
	options.update(get_cookie_options())
	html = inline_private_images(html)

	# page size: explicit option, then Print Settings, then A4
	pdf_page_size = (
		options.get("page-size") or frappe.db.get_single_value("Print Settings", "pdf_page_size") or "A4"
	)

	if pdf_page_size != "Custom":
		options["page-size"] = pdf_page_size
		return html, options

	# Custom size: take the dimensions from the options or from Print Settings
	for option_key, settings_field in (
		("page-height", "pdf_page_height"),
		("page-width", "pdf_page_width"),
	):
		options[option_key] = options.get(option_key) or frappe.db.get_single_value(
			"Print Settings", settings_field
		)

	return html, options


def get_cookie_options():
	"""Return the wkhtmltopdf cookie options for the current session.

	When there is a session with a ``sid`` and a request is in progress, a
	cookie jar file holding the session id (restricted to the host domain) is
	created and its path returned under ``"cookie-jar"``. The file is expected
	to be removed later by ``cleanup``.

	Returns:
		``{"cookie-jar": <path>}`` or an empty dict.
	"""
	import tempfile

	options = {}
	if frappe.session and frappe.session.sid and hasattr(frappe.local, "request"):
		# Remove port from request.host
		# https://werkzeug.palletsprojects.com/en/0.16.x/wrappers/#werkzeug.wrappers.BaseRequest.host
		domain = frappe.utils.get_host_name().split(":", 1)[0]

		# Use wkhtmltopdf's cookie-jar feature to set cookies and restrict them to host domain.
		# The jar contains the session id, so it is created with mkstemp, which
		# makes the file readable and writable only by the creating user.
		fd, cookiejar = tempfile.mkstemp(prefix="frappe-pdf-", suffix=".jar")
		with os.fdopen(fd, "w") as f:
			f.write(f"sid={frappe.session.sid}; Domain={domain};\n")

		options["cookie-jar"] = cookiejar

	return options


def read_options_from_html(html):
	"""Extract PDF options that are declared inside the HTML document.

	The document is parsed with html5lib, header/footer options are collected
	by ``prepare_header_footer``, PDF-only visibility classes are applied and
	the page related CSS properties set on ``.print-format`` are read.

	Args:
		html: HTML of the document.

	Returns:
		Tuple ``(html, options)``: the re-serialised document and the options
		found in it.
	"""
	soup = BeautifulSoup(html, "html5lib")

	options = dict(prepare_header_footer(soup))

	toggle_visible_pdf(soup)

	# CSS properties that are understood as PDF options
	recognised = (
		"margin-top",
		"margin-bottom",
		"margin-left",
		"margin-right",
		"page-size",
		"header-spacing",
		"orientation",
		"page-width",
		"page-height",
	)
	# Later declarations win over earlier ones and over header/footer defaults
	for style in get_print_format_styles(soup):
		if style.name in recognised:
			options[style.name] = style.value

	return str(soup), options


def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]:
	"""
	Collect the CSS properties declared directly on the class 'print-format'.

	A rule counts when one of its comma separated selectors is exactly
	'.print-format':
	1) .print-format { ... }
	2) .print-format, p { ... } | p, .print-format { ... }

	Rules that target descendants are ignored:
	1) .print-format p { ... } | .print-format > p { ... }
	2) .print-format #abc { ... }

	Returns:
	[cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...]
	"""
	# One stylesheet made of the contents of every <style> tag
	css_text = "".join(cstr(tag.string) for tag in soup.find_all("style"))

	# Let the css parser split it into rules
	sheet = cssutils.parseString(css_text)

	collected = []
	for rule in sheet:
		if not isinstance(rule, cssutils.css.CSSStyleRule):
			continue

		selectors = [selector.strip() for selector in rule.selectorText.split(",")]
		if ".print-format" not in selectors:
			continue

		collected.extend(rule.style)

	return collected


def inline_private_images(html) -> str:
	"""Replace the ``src`` of images with inline base64 data where possible.

	Every ``<img>`` whose ``src`` can be converted by ``_get_base64_image`` gets
	the resulting data URI; all other images are left untouched. Images
	without a ``src`` attribute (or with an empty one) are skipped.

	Args:
		html: HTML of the document.

	Returns:
		The document re-serialised after the replacements.
	"""
	soup = BeautifulSoup(html, "html.parser")
	for img in soup.find_all("img"):
		# img["src"] would raise KeyError for an <img> without that attribute
		src = img.get("src")
		if not src:
			continue
		if b64 := _get_base64_image(src):
			img["src"] = b64
	return str(soup)


def _get_base64_image(src):
	"""Return a base64 data URI for ``src`` if it points to a private image file.

	Returns ``None`` when the path does not look like an image, when no file
	is found for it or when the file is not private. Any failure along the way
	is logged to the "pdf" logger and also results in ``None``.
	"""
	try:
		parsed = urlparse(src)
		query = parse_qs(parsed.query)
		mime_type = mimetypes.guess_type(parsed.path)[0]
		if not (mime_type and mime_type.startswith("image/")):
			return None

		# An optional `fid` query parameter names the file explicitly
		fid_values = query.get("fid")
		filename = (fid_values[0] or None) if fid_values else None

		file = find_file_by_url(parsed.path, name=filename)
		if file and file.is_private:
			encoded = base64.b64encode(file.get_content()).decode()
			return f"data:{mime_type};base64,{encoded}"
		return None
	except Exception:
		frappe.logger("pdf").error("Failed to convert inline images to base64", exc_info=True)


def prepare_header_footer(soup: BeautifulSoup):
	"""Move the header and footer out of ``soup`` into standalone HTML files.

	For each of ``header-html`` and ``footer-html``: if an element with that id
	exists, every element with the id is removed from ``soup``, the first one
	is rendered through the last registered ``pdf_header_html`` /
	``pdf_footer_html`` hook and written to a temporary file whose path is
	stored under the id. Otherwise the matching margin (top or bottom) is set
	to 15mm.

	Args:
		soup: Parsed document; modified in place.

	Returns:
		Dict with the header/footer file paths and/or default margins.
	"""
	options = {}

	head = soup.find("head").contents
	styles = soup.find_all("style")

	print_css = bundled_asset("print.bundle.css").lstrip("/")
	css = frappe.read_file(os.path.join(frappe.local.sites_path, print_css))

	# (element id, hook that renders it, margin to default when it is absent)
	sections = (
		("header-html", "pdf_header_html", "margin-top"),
		("footer-html", "pdf_footer_html", "margin-bottom"),
	)

	for html_id, hook_name, margin_key in sections:
		content = soup.find(id=html_id)
		if not content:
			options[margin_key] = "15mm"
			continue

		content = content.extract()
		# `header/footer-html` are extracted, rendered as html
		# and passed in wkhtmltopdf options (as '--header/footer-html')
		# Remove instances of them from main content for render_template
		for duplicate in soup.find_all(id=html_id):
			duplicate.extract()

		toggle_visible_pdf(content)
		hooks = frappe.get_hooks(hook_name)
		html = frappe.call(
			hooks[-1],
			soup=soup,
			head=head,
			content=content,
			styles=styles,
			html_id=html_id,
			css=css,
		)

		# create temp file
		fname = os.path.join("/tmp", f"frappe-pdf-{frappe.generate_hash()}.html")
		with open(fname, "wb") as f:
			f.write(html.encode("utf-8"))

		# {"header-html": "/tmp/frappe-pdf-random.html"}
		options[html_id] = fname

	return options


def cleanup(options):
	"""Delete the temporary files referenced by ``options``.

	Looks at the ``header-html``, ``footer-html`` and ``cookie-jar`` entries and
	removes the file each one points to. Missing entries, empty values and
	files that no longer exist are ignored.

	Args:
		options: Dict of PDF options; it is not modified.
	"""
	for key in ("header-html", "footer-html", "cookie-jar"):
		path = options.get(key)
		if not path:
			continue
		# Remove directly instead of checking for existence first, so a file
		# that vanishes in between cannot raise.
		with contextlib.suppress(FileNotFoundError):
			os.remove(path)


def toggle_visible_pdf(soup):
	"""Apply the PDF-only visibility classes to ``soup`` in place.

	Elements with class ``visible-pdf`` lose that class (which unhides them);
	elements with class ``hidden-pdf`` are removed from the document.
	"""
	for shown in soup.find_all(attrs={"class": "visible-pdf"}):
		# dropping the class is what makes the element visible
		shown.attrs["class"].remove("visible-pdf")

	for hidden in soup.find_all(attrs={"class": "hidden-pdf"}):
		# take the element out of the html altogether
		hidden.extract()


@frappe.whitelist()
@redis_cache(ttl=60 * 60)
def is_wkhtmltopdf_valid():
	"""Return True when ``wkhtmltopdf --version`` runs and its output mentions "qt".

	Any failure to run the command or decode its output yields ``False``.
	"""
	try:
		version_text = subprocess.check_output(["wkhtmltopdf", "--version"]).decode("utf-8")
	except Exception:
		return False
	return "qt" in version_text.lower()


def get_wkhtmltopdf_version():
	"""Return the installed wkhtmltopdf version string, or ``"0"`` if unknown.

	The value is read from the ``wkhtmltopdf_version`` cache entry when
	present. Otherwise it is taken from the second space separated token of
	``wkhtmltopdf --version`` and stored in the cache; failures are ignored.
	"""
	cached = frappe.cache.hget("wkhtmltopdf_version", None)
	if cached:
		return cached

	version = None
	# Best effort: a missing binary or unexpected output simply yields "0"
	with contextlib.suppress(Exception):
		raw = subprocess.check_output(["wkhtmltopdf", "--version"])
		version = raw.decode("utf-8").split(" ")[1]
		frappe.cache.hset("wkhtmltopdf_version", None, version)

	return version or "0"


def pdf_contains_js(file_content: bytes):
	"""
	Tell whether a PDF file carries JavaScript.

	The document root and every page are searched for a ``/JS`` or
	``/JavaScript`` key, descending into nested dicts and lists.

	Args:
		file_content (bytes): The content of the PDF file.

	Returns:
		bool: True if such a key is found. False otherwise, including when the
		pages cannot be read because the file is not decrypted.
	"""
	reader = PdfReader(io.BytesIO(file_content))

	def has_javascript(obj):
		if isinstance(obj, dict):
			return any(
				key in ("/JS", "/JavaScript") or has_javascript(value) for key, value in obj.items()
			)
		if isinstance(obj, list):
			return any(has_javascript(item) for item in obj)
		return False

	if has_javascript(reader.trailer.get("/Root", {})):
		return True

	try:
		return any(has_javascript(page) for page in reader.pages)
	except errors.FileNotDecryptedError:
		# Pages of an encrypted file cannot be inspected; treated as "no JavaScript"
		return False