# seitime-frappe/frappe/utils/pdf.py

# Copyright (c) 2015, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import base64
import contextlib
import io
import mimetypes
import os
import subprocess
from urllib.parse import parse_qs, urlparse

import cssutils
import pdfkit
import pdfkit.api
from pdfkit.pdfkit import PDFKit as OriginalPDFKit

pdfkit.source.unicode = str  # NOTE: upstream bug; PYTHONOPTIMIZE=1 optimized this away
from bs4 import BeautifulSoup
from packaging.version import Version
from pypdf import PdfReader, PdfWriter, errors

import frappe
from frappe import _
from frappe.core.doctype.file.utils import find_file_by_url
from frappe.utils import cstr, scrub_urls
from frappe.utils.caching import redis_cache
from frappe.utils.data import get_url
from frappe.utils.jinja_globals import bundled_asset, is_rtl

# Hand cssutils the "cssutils" frappe logger to use for its log output.
cssutils.log.setLog(frappe.logger("cssutils"))

# Substrings searched for in the message of an OSError raised while generating
# a PDF (see get_pdf). A match is reported to the user as a failure caused by
# broken image links; any other OSError is re-raised unchanged.
PDF_CONTENT_ERRORS = [
	"ContentNotFoundError",
	"ContentOperationNotPermittedError",
	"UnknownContentError",
	"RemoteHostClosedError",
]


class FrappePDFKit(OriginalPDFKit):
	"""``PDFKit`` subclass that never takes options from the HTML document itself."""

	def _find_options_in_meta(self, content):
		"""Report that ``content`` declares no options.

		Overrides the parent lookup so that nothing found in HTML meta tags is
		turned into a wkhtmltopdf option; only options passed explicitly by
		the caller remain in effect.
		"""
		return dict()


# Replace PDFKit in all relevant modules: the package root, the module that
# defines the class, and the api module each hold their own `PDFKit` name, so
# every one of them is rebound to the FrappePDFKit override.
pdfkit.PDFKit = FrappePDFKit
pdfkit.pdfkit.PDFKit = FrappePDFKit
pdfkit.api.PDFKit = FrappePDFKit


def pdf_header_html(soup, head, content, styles, html_id, css, path=None):
	"""Render the template used for a PDF header (or footer) section.

	Args:
		soup: Accepted for hook compatibility; not used here.
		head: Passed to the template as ``head``.
		content: Passed to the template as ``content``.
		styles: Passed to the template as ``styles``.
		html_id: Passed to the template as ``html_id``.
		css: Passed to the template as ``css``.
		path: Template path; falls back to the bundled header/footer template
			when empty.

	Returns:
		The result of ``frappe.render_template``.
	"""
	template_path = path or "templates/print_formats/pdf_header_footer.html"
	context = {
		"head": head,
		"content": content,
		"styles": styles,
		"html_id": html_id,
		"css": css,
		"lang": frappe.local.lang,
		"layout_direction": "rtl" if is_rtl() else "ltr",
	}
	return frappe.render_template(template_path, context)


def pdf_body_html(template, args, **kwargs):
	"""Render ``template`` with ``args``, exposing ``len`` as a filter.

	Extra keyword arguments are accepted and ignored. Any exception raised by
	the render is converted into a ``frappe.PrintFormatError`` whose message
	carries the guessed template line number and the original error text.
	"""
	try:
		rendered = template.render(args, filters={"len": len})
	except Exception as e:
		# Must run inside the handler: the line-number guess reads the
		# exception that is currently being handled.
		message = _("Error in print format on line {0}: {1}").format(
			_guess_template_error_line_number(template), str(e)
		)
		frappe.throw(message, exc=frappe.PrintFormatError, title=_("Print Format Error"))
	else:
		return rendered


def _guess_template_error_line_number(template) -> int | None:
	"""Guess line on which exception occurred from current traceback."""
	with contextlib.suppress(Exception):
		import sys
		import traceback

		_, _, tb = sys.exc_info()

		for frame in reversed(traceback.extract_tb(tb)):
			if template.filename in frame.filename:
				return frame.lineno


def pdf_footer_html(soup, head, content, styles, html_id, css, path=None):
	"""Render a PDF footer; delegates unchanged to ``pdf_header_html``."""
	return pdf_header_html(soup, head, content, styles, html_id, css, path)


def get_pdf(html, options=None, output: PdfWriter | None = None):
	"""Render ``html`` to a PDF through pdfkit (wkhtmltopdf).

	Args:
		html: HTML markup to render.
		options: Optional wkhtmltopdf options; completed by ``prepare_options``.
			A ``"password"`` entry is also used to encrypt the generated PDF
			when no ``output`` writer is supplied.
		output: Optional ``PdfWriter``. When given, the generated pages are
			appended to it and the writer itself is returned.

	Returns:
		``output`` when it was supplied, otherwise the PDF as ``bytes``.

	Raises:
		A frappe exception (via ``frappe.throw``) when the renderer reports one
		of ``PDF_CONTENT_ERRORS``; any other ``OSError`` is re-raised unchanged.
	"""
	html = scrub_urls(html)
	html, options = prepare_options(html, options)

	options.update({"disable-javascript": "", "disable-local-file-access": ""})

	if Version(get_wkhtmltopdf_version()) > Version("0.12.3"):
		options.update({"disable-smart-shrinking": ""})

	try:
		# No output path is given, so the PDF is returned in memory and no
		# output file is created.
		filedata = pdfkit.from_string(html, options=options or {}, verbose=True)
	except OSError as e:
		if not any(error in str(e) for error in PDF_CONTENT_ERRORS):
			raise

		# When from_string raises there is no partial output to fall back on,
		# so a content error always ends the request. Log the failure instead
		# of dumping the whole document to stdout.
		frappe.logger("pdf").error("PDF generation failed with a content error", exc_info=True)
		frappe.throw(_("PDF generation failed because of broken image links"))
	finally:
		# The temporary header/footer/cookie-jar files are only needed while
		# the renderer runs.
		cleanup(options)

	# create in-memory binary stream from filedata and create a PdfReader object
	reader = PdfReader(io.BytesIO(filedata))

	if output:
		output.append_pages_from_reader(reader)
		return output

	writer = PdfWriter()
	writer.append_pages_from_reader(reader)

	if "password" in options:
		writer.encrypt(options["password"])

	return get_file_data_from_writer(writer)


def measure_time(func):
	"""Decorator that prints how long each call to ``func`` took.

	The wrapper forwards all arguments, returns the wrapped function's result
	unchanged, and keeps its metadata (``__name__``, ``__doc__``, ...) through
	``functools.wraps``. Nothing is printed when ``func`` raises.
	"""
	import functools
	import time

	@functools.wraps(func)
	def wrapper(*args, **kwargs):
		# perf_counter is monotonic, so the interval cannot be distorted by
		# system clock adjustments.
		start_time = time.perf_counter()
		result = func(*args, **kwargs)
		elapsed = time.perf_counter() - start_time
		print(f"Function {func.__name__} took {elapsed:.4f} seconds")
		return result

	return wrapper


@measure_time
def get_chrome_pdf(print_format, html, options, output, pdf_generator=None):
	"""Generate a PDF with the chrome generator, if that generator is selected.

	Args:
		print_format: Passed through to ``Browser``.
		html: Passed through to ``Browser``.
		options: Passed through to ``Browser``.
		output: Passed to ``PDFTransformer.transform_pdf``.
		pdf_generator: Name of the selected generator; anything other than
			``"chrome"`` makes this function a no-op.

	Returns:
		``None`` when ``pdf_generator`` is not ``"chrome"`` (so the caller can
		fall back to the default generator), otherwise the result of
		``PDFTransformer.transform_pdf``.
	"""
	if pdf_generator != "chrome":
		# Use the default pdf generator. Checked before the imports below so
		# the chrome generator modules are only loaded when actually needed.
		return

	from frappe.utils.pdf_generator.browser import Browser
	from frappe.utils.pdf_generator.chrome_pdf_generator import ChromePDFGenerator
	from frappe.utils.pdf_generator.pdf_merge import PDFTransformer

	# scrubbing url to expand url is not required as we have set url.
	# also, planning to remove network requests anyway 🤞
	generator = ChromePDFGenerator()
	browser = Browser(generator, print_format, html, options)
	transformer = PDFTransformer(browser)
	# transforms and merges header, footer into body pdf and returns merged pdf
	return transformer.transform_pdf(output=output)


def get_file_data_from_writer(writer_obj):
	"""Return everything ``writer_obj.write`` emits, as ``bytes``.

	The writer is asked to write itself into an in-memory binary buffer and
	the buffer's full contents are returned.
	"""
	# https://docs.python.org/3/library/io.html
	buffer = io.BytesIO()
	writer_obj.write(buffer)

	# getvalue() yields the whole buffer regardless of the current position
	return buffer.getvalue()


def prepare_options(html, options):
	"""Build the final wkhtmltopdf options for ``html``.

	A non-empty ``options`` dict is updated in place; an empty or missing one
	is replaced by a new dict. Fixed rendering flags are applied first, then
	default side margins, then options read from the HTML itself, then the
	cookie-jar option. Private images in the HTML are inlined, and finally the
	page size is resolved from the options, Print Settings, or "A4".

	Returns:
		Tuple ``(html, options)`` with the processed markup and the options.
	"""
	options = options or {}

	fixed_flags = {
		"print-media-type": None,
		"background": None,
		"images": None,
		"quiet": None,
		"encoding": "UTF-8",
	}
	options.update(fixed_flags)

	for side in ("margin-right", "margin-left"):
		if not options.get(side):
			options[side] = "15mm"

	html, html_options = read_options_from_html(html)
	if html_options:
		options.update(html_options)

	# cookies
	options.update(get_cookie_options())
	html = inline_private_images(html)

	# page size
	pdf_page_size = (
		options.get("page-size") or frappe.db.get_single_value("Print Settings", "pdf_page_size") or "A4"
	)

	if pdf_page_size != "Custom":
		options["page-size"] = pdf_page_size
		return html, options

	# Custom size: explicit dimensions win, Print Settings fill the gaps
	for dimension, setting in (("page-height", "pdf_page_height"), ("page-width", "pdf_page_width")):
		options[dimension] = options.get(dimension) or frappe.db.get_single_value("Print Settings", setting)

	return html, options


def get_cookie_options():
	"""Return the wkhtmltopdf ``cookie-jar`` option for the current session.

	When there is a session id and a request is in progress, the session
	cookie is written to a temporary cookie-jar file restricted to the site's
	host name, and ``{"cookie-jar": <path>}`` is returned. Otherwise an empty
	dict is returned. The file is left on disk for the caller to remove (see
	``cleanup``).
	"""
	import tempfile

	options = {}
	if frappe.session and frappe.session.sid and hasattr(frappe.local, "request"):
		# Remove port from request.host
		# https://werkzeug.palletsprojects.com/en/0.16.x/wrappers/#werkzeug.wrappers.BaseRequest.host
		domain = frappe.utils.get_host_name().split(":", 1)[0]

		# Use wkhtmltopdf's cookie-jar feature to set cookies and restrict them to host domain.
		# The jar holds the session id, so it is created with mkstemp: the file
		# is readable and writable only by the creating user, and the platform's
		# temp directory is used instead of a hard-coded /tmp.
		fd, cookiejar = tempfile.mkstemp(prefix="frappe-pdf-", suffix=".jar")
		with os.fdopen(fd, "w") as f:
			f.write(f"sid={frappe.session.sid}; Domain={domain};\n")

		options["cookie-jar"] = cookiejar

	return options


def read_options_from_html(html):
	"""Extract wkhtmltopdf options embedded in ``html``.

	The markup is parsed with html5lib, header/footer sections are pulled out
	(which yields their options), ``visible-pdf`` / ``hidden-pdf`` handling is
	applied, and page-level CSS properties declared on ``.print-format`` are
	copied into the options.

	Returns:
		Tuple ``(html, options)``: the re-serialised markup and the options.
	"""
	soup = BeautifulSoup(html, "html5lib")

	options = dict(prepare_header_footer(soup))

	toggle_visible_pdf(soup)

	# CSS properties that map directly onto wkhtmltopdf options
	page_attrs = (
		"margin-top",
		"margin-bottom",
		"margin-left",
		"margin-right",
		"page-size",
		"header-spacing",
		"orientation",
		"page-width",
		"page-height",
	)
	for style in get_print_format_styles(soup):
		if style.name in page_attrs:
			# later declarations override earlier ones
			options[style.name] = style.value

	return str(soup), options


def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]:
	"""
	Collect the CSS properties declared directly on the class 'print-format'.

	Accepted selectors:
	1) .print-format { ... }
	2) .print-format, p { ... } | p, .print-format { ... }

	Rejected selectors (they target child elements):
	1) .print-format p { ... } | .print-format > p { ... }
	2) .print-format #abc { ... }

	Returns:
	[cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...]
	"""
	# Concatenate the contents of every <style> tag into one stylesheet
	stylesheet = "".join(cstr(tag.string) for tag in soup.find_all("style"))

	# Let the css parser split it into rules
	rules = cssutils.parseString(stylesheet)

	collected = []
	for rule in rules:
		if not isinstance(rule, cssutils.css.CSSStyleRule):
			continue

		# A rule qualifies only when one of its comma-separated selectors is
		# exactly ".print-format"; descendant/child selectors never match.
		selectors = rule.selectorText.split(",")
		if any(selector.strip() == ".print-format" for selector in selectors):
			collected.extend(rule.style)

	return collected


def inline_private_images(html) -> str:
	"""Replace the ``src`` of private images in ``html`` with base64 data URIs.

	Every ``<img>`` whose ``src`` resolves to a private image file (as decided
	by ``_get_base64_image``) gets that ``src`` replaced; all other images are
	left untouched. ``<img>`` tags without a ``src`` attribute are skipped
	rather than raising ``KeyError``.

	Returns:
		The markup re-serialised from the parsed document.
	"""
	soup = BeautifulSoup(html, "html.parser")
	for img in soup.find_all("img"):
		src = img.get("src")
		if not src:
			# nothing to inline for an image without a source
			continue
		if b64 := _get_base64_image(src):
			img["src"] = b64
	return str(soup)


def _get_base64_image(src):
	"""Return ``src`` as a base64 data URI if it points to a private image file.

	Returns ``None`` when the path does not look like an image, when no file
	is found for it, or when the file is not private. Any exception is logged
	to the "pdf" logger and also results in ``None``.
	"""
	try:
		url_parts = urlparse(src)
		path = url_parts.path
		query = parse_qs(url_parts.query)

		mime_type = mimetypes.guess_type(path)[0]
		if not (mime_type is not None and mime_type.startswith("image/")):
			return

		# optional ?fid=<name> narrows the file lookup
		fid_values = query.get("fid")
		filename = (fid_values[0] or None) if fid_values else None

		file = find_file_by_url(path, name=filename)
		if file and file.is_private:
			encoded = base64.b64encode(file.get_content()).decode()
			return f"data:{mime_type};base64,{encoded}"
		return
	except Exception:
		frappe.logger("pdf").error("Failed to convert inline images to base64", exc_info=True)


def prepare_header_footer(soup: BeautifulSoup):
	"""Extract header/footer sections from ``soup`` into temporary HTML files.

	For each of ``header-html`` and ``footer-html``: if an element with that
	id exists, every element with the id is removed from ``soup``, the first
	one is rendered through the last registered ``pdf_header_html`` /
	``pdf_footer_html`` hook, and the result is written to a temp file whose
	path is stored under the id. If it does not exist, a 15mm top/bottom
	margin is set instead.

	Returns:
		Dict of wkhtmltopdf options (file paths and/or default margins).
	"""
	# hook that renders each section / margin used when the section is absent
	hook_names = {"header-html": "pdf_header_html", "footer-html": "pdf_footer_html"}
	fallback_margin = {"header-html": "margin-top", "footer-html": "margin-bottom"}

	options = {}

	head = soup.find("head").contents
	styles = soup.find_all("style")

	print_css_path = bundled_asset("print.bundle.css").lstrip("/")
	css = frappe.read_file(os.path.join(frappe.local.sites_path, print_css_path))

	# extract header and footer
	for html_id in ("header-html", "footer-html"):
		section = soup.find(id=html_id)
		if not section:
			options[fallback_margin[html_id]] = "15mm"
			continue

		section = section.extract()
		# `header/footer-html` are extracted, rendered as html
		# and passed in wkhtmltopdf options (as '--header/footer-html')
		# Remove instances of them from main content for render_template
		for duplicate in soup.find_all(id=html_id):
			duplicate.extract()

		toggle_visible_pdf(section)
		renderers = frappe.get_hooks(hook_names[html_id])
		rendered = frappe.call(
			renderers[-1],
			soup=soup,
			head=head,
			content=section,
			styles=styles,
			html_id=html_id,
			css=css,
		)

		# create temp file
		fname = os.path.join("/tmp", f"frappe-pdf-{frappe.generate_hash()}.html")
		with open(fname, "wb") as f:
			f.write(rendered.encode("utf-8"))

		# {"header-html": "/tmp/frappe-pdf-random.html"}
		options[html_id] = fname

	return options


def cleanup(options):
	"""Delete the temporary files referenced by ``options``, if they exist.

	Looks at the ``header-html``, ``footer-html`` and ``cookie-jar`` entries;
	missing keys, empty values and paths that no longer exist are ignored.
	"""
	temp_paths = (options.get(key) for key in ("header-html", "footer-html", "cookie-jar"))
	for path in temp_paths:
		if not path:
			continue
		if os.path.exists(path):
			os.remove(path)


def toggle_visible_pdf(soup):
	"""Apply the ``visible-pdf`` / ``hidden-pdf`` classes to ``soup`` in place.

	Elements carrying ``visible-pdf`` lose that class (which unhides them);
	elements carrying ``hidden-pdf`` are removed from the document.
	"""
	to_reveal = soup.find_all(attrs={"class": "visible-pdf"})
	for element in to_reveal:
		# remove visible-pdf class to unhide
		css_classes = element.attrs["class"]
		css_classes.remove("visible-pdf")

	to_drop = soup.find_all(attrs={"class": "hidden-pdf"})
	for element in to_drop:
		# remove tag from html
		element.extract()


@frappe.whitelist()
@redis_cache(ttl=60 * 60)
def is_wkhtmltopdf_valid():
	"""Return True when ``wkhtmltopdf --version`` output mentions "qt".

	The check is case-insensitive. Any failure to run the command or decode
	its output yields ``False``.
	"""
	try:
		version_banner = subprocess.check_output(["wkhtmltopdf", "--version"]).decode("utf-8")
		return "qt" in version_banner.lower()
	except Exception:
		return False


def get_wkhtmltopdf_version():
	"""Return the wkhtmltopdf version string, or "0" when it cannot be found.

	A cached value is used when present. Otherwise the second space-separated
	token of ``wkhtmltopdf --version`` is taken as the version and stored in
	the cache. Failures are ignored on purpose and lead to the "0" fallback.
	"""
	cached_version = frappe.cache.hget("wkhtmltopdf_version", None)
	if cached_version:
		return cached_version

	detected_version = None
	with contextlib.suppress(Exception):
		raw_output = subprocess.check_output(["wkhtmltopdf", "--version"])
		detected_version = raw_output.decode("utf-8").split(" ")[1]
		frappe.cache.hset("wkhtmltopdf_version", None, detected_version)

	return detected_version or "0"


def pdf_contains_js(file_content: bytes):
	"""
	Check if a PDF file contains JavaScript.

	The document root from the trailer and then every page are searched for a
	"/JS" or "/JavaScript" key, descending into nested dicts and lists.

	Args:
		file_content (bytes): The content of the PDF file.

	Returns:
		bool: True if such a key is found, False otherwise. False is also
		returned when iterating the pages fails because the file has not been
		decrypted.
	"""
	js_keys = ("/JS", "/JavaScript")

	def has_javascript(node):
		if isinstance(node, dict):
			return any(key in js_keys or has_javascript(value) for key, value in node.items())
		if isinstance(node, list):
			return any(has_javascript(item) for item in node)
		return False

	reader = PdfReader(io.BytesIO(file_content))

	if has_javascript(reader.trailer.get("/Root", {})):
		return True

	try:
		return any(has_javascript(page) for page in reader.pages)
	except errors.FileNotDecryptedError:
		return False


def get_host_url():
	"""Return the host URL of the current request, or the site URL plus "/" when there is no request."""
	return frappe.request.host_url if frappe.request else get_url() + "/"