seitime-frappe/frappe/utils/pdf.py

458 lines
12 KiB
Python

# Copyright (c) 2015, Frappe Technologies Pvt. Ltd. and Contributors
# License: MIT. See LICENSE
import base64
import contextlib
import io
import mimetypes
import os
import subprocess
from urllib.parse import parse_qs, urlparse
import cssutils
import pdfkit
pdfkit.source.unicode = str # NOTE: upstream bug; PYTHONOPTIMIZE=1 optimized this away
from bs4 import BeautifulSoup
from packaging.version import Version
from pypdf import PdfReader, PdfWriter, errors
import frappe
from frappe import _
from frappe.core.doctype.file.utils import find_file_by_url
from frappe.utils import cstr, scrub_urls
from frappe.utils.caching import redis_cache
from frappe.utils.jinja_globals import bundled_asset, is_rtl
# Send cssutils' log output to the frappe logger named "cssutils".
cssutils.log.setLog(frappe.logger("cssutils"))

# Substrings that get_pdf looks for in the text of an OSError raised during PDF
# generation, to decide how that error is handled.
PDF_CONTENT_ERRORS = [
    "ContentNotFoundError",
    "ContentOperationNotPermittedError",
    "UnknownContentError",
    "RemoteHostClosedError",
]
def pdf_header_html(soup, head, content, styles, html_id, css, path=None):
    """Render the header/footer template and return the result.

    Args:
        soup: Accepted as part of the call signature; not used here.
        head, content, styles, html_id, css: Passed to the template under
            the same names.
        path: Template to render. When falsy,
            ``templates/print_formats/pdf_header_footer.html`` is used.

    Returns:
        Whatever ``frappe.render_template`` returns for the template.
    """
    template_path = path or "templates/print_formats/pdf_header_footer.html"

    # The template also receives the current language and text direction.
    context = {
        "head": head,
        "content": content,
        "styles": styles,
        "html_id": html_id,
        "css": css,
        "lang": frappe.local.lang,
        "layout_direction": "rtl" if is_rtl() else "ltr",
    }
    return frappe.render_template(template_path, context)
def pdf_body_html(template, args, **kwargs):
    """Render ``template`` with ``args`` and return the result.

    The template is rendered with an extra ``filters`` mapping exposing
    ``len``. Extra keyword arguments are accepted and ignored.

    Any exception raised while rendering is reported through ``frappe.throw``
    as a ``frappe.PrintFormatError`` whose message includes the guessed
    template line number and the original error.
    """
    extra_filters = {"len": len}
    try:
        return template.render(args, filters=extra_filters)
    except Exception as exc:
        message_template = _("Error in print format on line {0}: {1}")
        # Best-effort guess; may be None when no matching frame is found.
        line_number = _guess_template_error_line_number(template)
        frappe.throw(
            message_template.format(line_number, exc),
            exc=frappe.PrintFormatError,
            title=_("Print Format Error"),
        )
def _guess_template_error_line_number(template) -> int | None:
"""Guess line on which exception occurred from current traceback."""
with contextlib.suppress(Exception):
import sys
import traceback
_, _, tb = sys.exc_info()
for frame in reversed(traceback.extract_tb(tb)):
if template.filename in frame.filename:
return frame.lineno
def pdf_footer_html(soup, head, content, styles, html_id, css, path=None):
    """Render the footer HTML.

    The footer goes through exactly the same rendering as the header, so this
    forwards every argument unchanged to ``pdf_header_html``.
    """
    return pdf_header_html(soup, head, content, styles, html_id, css, path)
def get_pdf(html, options=None, output: PdfWriter | None = None):
    """Render ``html`` to a PDF with wkhtmltopdf (through pdfkit).

    Args:
        html: HTML markup to render. It is passed through ``scrub_urls`` and
            ``prepare_options`` before rendering.
        options: Optional dict of wkhtmltopdf options, completed by
            ``prepare_options``. When it contains ``"password"`` and no
            ``output`` is given, the returned PDF is encrypted with it.
        output: Optional ``PdfWriter``. When given (and truthy) the generated
            pages are appended to it and the writer is returned.

    Returns:
        ``output`` with the new pages appended when it was passed, otherwise
        the PDF file content as bytes.

    Raises:
        OSError: re-raised from pdfkit when the error text matches none of
            ``PDF_CONTENT_ERRORS``.
        Whatever ``frappe.throw`` raises when the error text does match one
        of ``PDF_CONTENT_ERRORS``.
    """
    html = scrub_urls(html)
    html, options = prepare_options(html, options)

    options.update({"disable-javascript": "", "disable-local-file-access": ""})
    if Version(get_wkhtmltopdf_version()) > Version("0.12.3"):
        options.update({"disable-smart-shrinking": ""})

    try:
        # No output path is passed, so the PDF content is returned instead of
        # being written to a file.
        filedata = pdfkit.from_string(html, options=options or {}, verbose=True)
    except OSError as e:
        # When from_string raises there is no PDF data to salvage, so a
        # content-loading error is always reported to the user.
        if any(error in str(e) for error in PDF_CONTENT_ERRORS):
            print(html, options)
            frappe.throw(_("PDF generation failed because of broken image links"))
        raise
    finally:
        # Temporary header/footer/cookie files are removed on success and on failure.
        cleanup(options)

    # Parse only after generation succeeded, so `reader` is always bound below.
    reader = PdfReader(io.BytesIO(filedata))

    if output:
        output.append_pages_from_reader(reader)
        return output

    writer = PdfWriter()
    writer.append_pages_from_reader(reader)

    if "password" in options:
        writer.encrypt(options["password"])

    return get_file_data_from_writer(writer)
def measure_time(func):
    """Decorator that prints how long each call to ``func`` took.

    The wrapped function's return value is passed through unchanged and its
    metadata (``__name__``, ``__doc__``, ...) is preserved on the wrapper.
    Nothing is printed when ``func`` raises.
    """
    import functools
    import time

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the difference cannot go negative
        # when the system clock is adjusted during the call.
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        print(f"Function {func.__name__} took {end_time - start_time:.4f} seconds")
        return result

    return wrapper
@measure_time
def get_chrome_pdf(print_format, html, options, output, pdf_generator=None):
    """Generate a PDF with the Chrome-based generator.

    Args:
        print_format, html, options: Passed to ``Browser`` unchanged.
        output: Passed to ``PDFTransformer.transform_pdf`` as ``output``.
        pdf_generator: Name of the requested generator.

    Returns:
        ``None`` when ``pdf_generator`` is not ``"chrome"`` (the caller is
        expected to use the default generator); otherwise the value returned
        by ``PDFTransformer.transform_pdf``.
    """
    if pdf_generator != "chrome":
        # Use the default pdf generator
        return None

    # Imported only after the guard, so the default-generator path neither
    # loads nor depends on the Chrome modules.
    from frappe.utils.pdf_generator.browser import Browser
    from frappe.utils.pdf_generator.chrome_pdf_generator import ChromePDFGenerator
    from frappe.utils.pdf_generator.pdf_merge import PDFTransformer

    # URL scrubbing is skipped on this path (unlike get_pdf).
    generator = ChromePDFGenerator()
    browser = Browser(generator, print_format, html, options)
    transformer = PDFTransformer(browser)

    # transforms and merges header, footer into body pdf and returns merged pdf
    return transformer.transform_pdf(output=output)
def get_file_data_from_writer(writer_obj):
    """Return everything ``writer_obj`` writes, as bytes.

    ``writer_obj.write`` is called with an in-memory binary stream and the
    full content of that stream is returned.
    """
    buffer = io.BytesIO()
    writer_obj.write(buffer)
    # getvalue() returns the whole buffer regardless of the current position.
    return buffer.getvalue()
def prepare_options(html, options):
    """Build the final wkhtmltopdf options and the HTML to render.

    The given ``options`` are copied, so the caller's dict is never modified
    and can be reused for further calls without carrying over temp-file paths
    or margins from a previous document.

    On top of the caller's options this sets fixed defaults (print media
    type, background, images, quiet, UTF-8 encoding), default left/right
    margins of 15mm, the options read from the HTML itself, the cookie-jar
    option and the page size.

    Args:
        html: HTML markup of the document.
        options: Dict of wkhtmltopdf options, or a falsy value for none.

    Returns:
        A ``(html, options)`` tuple: the processed HTML and the new options dict.
    """
    # Copy so that the caller's dict is left untouched.
    options = options.copy() if options else {}

    options.update(
        {
            "print-media-type": None,
            "background": None,
            "images": None,
            "quiet": None,
            # 'no-outline': None,
            "encoding": "UTF-8",
            # 'load-error-handling': 'ignore'
        }
    )

    if not options.get("margin-right"):
        options["margin-right"] = "15mm"

    if not options.get("margin-left"):
        options["margin-left"] = "15mm"

    # Options found in the HTML override what was set so far.
    html, html_options = read_options_from_html(html)
    options.update(html_options or {})

    # cookies
    options.update(get_cookie_options())

    html = inline_private_images(html)

    # page size: explicit option first, then Print Settings, then A4
    pdf_page_size = (
        options.get("page-size") or frappe.db.get_single_value("Print Settings", "pdf_page_size") or "A4"
    )

    if pdf_page_size == "Custom":
        options["page-height"] = options.get("page-height") or frappe.db.get_single_value(
            "Print Settings", "pdf_page_height"
        )
        options["page-width"] = options.get("page-width") or frappe.db.get_single_value(
            "Print Settings", "pdf_page_width"
        )
    else:
        options["page-size"] = pdf_page_size

    return html, options
def get_cookie_options():
    """Return the wkhtmltopdf cookie-jar option for the current session.

    When there is a session id and a request is in progress, a cookie-jar
    file holding the ``sid`` cookie (restricted to the host domain) is
    created and returned as ``{"cookie-jar": <path>}``. Otherwise an empty
    dict is returned. The file is not removed here; ``cleanup`` deletes it.
    """
    import tempfile

    options = {}
    if frappe.session and frappe.session.sid and hasattr(frappe.local, "request"):
        # Remove port from request.host
        # https://werkzeug.palletsprojects.com/en/0.16.x/wrappers/#werkzeug.wrappers.BaseRequest.host
        domain = frappe.utils.get_host_name().split(":", 1)[0]

        # Use wkhtmltopdf's cookie-jar feature to set cookies and restrict them to host domain.
        # The jar holds the session id, so it is created with mkstemp: the file
        # is created exclusively and is accessible only by the current user.
        fd, cookiejar = tempfile.mkstemp(suffix=".jar")
        with os.fdopen(fd, "w") as f:
            f.write(f"sid={frappe.session.sid}; Domain={domain};\n")

        options["cookie-jar"] = cookiejar

    return options
def read_options_from_html(html):
    """Extract wkhtmltopdf options embedded in ``html``.

    The HTML is parsed, header/footer elements are pulled out into options,
    pdf visibility classes are applied, and supported page properties found
    in the ``.print-format`` styles are copied into the options.

    Returns:
        A ``(html, options)`` tuple: the re-serialised HTML and the options dict.
    """
    supported = (
        "margin-top",
        "margin-bottom",
        "margin-left",
        "margin-right",
        "page-size",
        "header-spacing",
        "orientation",
        "page-width",
        "page-height",
    )

    soup = BeautifulSoup(html, "html5lib")

    # Header/footer extraction must run first: it removes those elements from the soup.
    options = dict(prepare_header_footer(soup))

    toggle_visible_pdf(soup)

    # Later declarations of the same property win over earlier ones.
    for style in get_print_format_styles(soup):
        if style.name in supported:
            options[style.name] = style.value

    return str(soup), options
def get_print_format_styles(soup: BeautifulSoup) -> list[cssutils.css.Property]:
    """
    Collect the style properties declared purely on the class 'print-format'.

    Counted:
        1) .print-format { ... }
        2) .print-format, p { ... } | p, .print-format { ... }

    Not counted (these target child elements):
        1) .print-format p { ... } | .print-format > p { ... }
        2) .print-format #abc { ... }

    Returns:
        [cssutils.css.Property(name='margin-top', value='50mm', priority=''), ...]
    """
    # One stylesheet made of the contents of every <style> tag, in document order
    css_text = "".join(cstr(tag.string) for tag in soup.find_all("style"))

    # Let the css parser split it into rules
    sheet = cssutils.parseString(css_text)

    collected = []
    for rule in sheet:
        if not isinstance(rule, cssutils.css.CSSStyleRule):
            continue

        # A rule counts only when one of its comma-separated selectors is
        # exactly ".print-format"; descendant/child selectors do not match.
        selectors = {part.strip() for part in rule.selectorText.split(",")}
        if ".print-format" in selectors:
            collected.extend(rule.style)

    return collected
def inline_private_images(html) -> str:
    """Replace image sources with inline base64 data where possible.

    Every ``<img>`` in ``html`` whose ``src`` can be converted by
    ``_get_base64_image`` gets that data URI as its new ``src``. Images
    without a ``src`` attribute (or with an empty one) are left untouched.

    Returns:
        The re-serialised HTML.
    """
    soup = BeautifulSoup(html, "html.parser")
    for img in soup.find_all("img"):
        # An <img> tag is not guaranteed to carry a src attribute;
        # img["src"] would raise KeyError for such a tag.
        src = img.get("src")
        if not src:
            continue

        if b64 := _get_base64_image(src):
            img["src"] = b64

    return str(soup)
def _get_base64_image(src):
"""Return base64 version of image if user has permission to view it"""
try:
parsed_url = urlparse(src)
path = parsed_url.path
query = parse_qs(parsed_url.query)
mime_type = mimetypes.guess_type(path)[0]
if mime_type is None or not mime_type.startswith("image/"):
return
filename = (query.get("fid") and query["fid"][0]) or None
file = find_file_by_url(path, name=filename)
if not file or not file.is_private:
return
b64_encoded_image = base64.b64encode(file.get_content()).decode()
return f"data:{mime_type};base64,{b64_encoded_image}"
except Exception:
frappe.logger("pdf").error("Failed to convert inline images to base64", exc_info=True)
def prepare_header_footer(soup: BeautifulSoup):
    """Extract header and footer elements from ``soup`` into wkhtmltopdf options.

    For each of the ids ``header-html`` and ``footer-html``:

    * when an element with that id exists, every element with that id is
      removed from ``soup``, the first one is rendered through the last
      registered ``pdf_header_html`` / ``pdf_footer_html`` hook, and the
      result is written to a temporary ``.html`` file whose path is stored
      under that id;
    * otherwise a default 15mm ``margin-top`` / ``margin-bottom`` is set.

    The temporary files are not removed here; ``cleanup`` deletes them.

    Returns:
        Dict of wkhtmltopdf options.
    """
    import tempfile

    options = {}

    head = soup.find("head").contents
    styles = soup.find_all("style")

    print_css = bundled_asset("print.bundle.css").lstrip("/")
    css = frappe.read_file(os.path.join(frappe.local.sites_path, print_css))

    # (element id, hook that renders it, margin used when the element is absent)
    sections = (
        ("header-html", "pdf_header_html", "margin-top"),
        ("footer-html", "pdf_footer_html", "margin-bottom"),
    )

    # extract header and footer
    for html_id, hook_name, margin_key in sections:
        content = soup.find(id=html_id)
        if not content:
            options[margin_key] = "15mm"
            continue

        content = content.extract()
        # `header/footer-html` are extracted, rendered as html
        # and passed in wkhtmltopdf options (as '--header/footer-html')
        # Remove instances of them from main content for render_template
        for tag in soup.find_all(id=html_id):
            tag.extract()

        toggle_visible_pdf(content)
        hook_func = frappe.get_hooks(hook_name)
        html = frappe.call(
            hook_func[-1],
            soup=soup,
            head=head,
            content=content,
            styles=styles,
            html_id=html_id,
            css=css,
        )

        # create temp file; mkstemp creates it exclusively in the system temp
        # directory, accessible only by the current user
        fd, fname = tempfile.mkstemp(prefix="frappe-pdf-", suffix=".html")
        with os.fdopen(fd, "wb") as f:
            f.write(html.encode("utf-8"))

        # {"header-html": "<tempdir>/frappe-pdf-random.html"}
        options[html_id] = fname

    return options
def cleanup(options):
    """Delete the temporary files referenced by ``options``.

    Looks at the ``header-html``, ``footer-html`` and ``cookie-jar`` entries
    and removes each file that is set and still exists. ``options`` itself is
    not changed.
    """
    temp_file_keys = ("header-html", "footer-html", "cookie-jar")
    for key in temp_file_keys:
        path = options.get(key)
        if not path:
            continue
        if os.path.exists(path):
            os.remove(path)
def toggle_visible_pdf(soup):
    """Apply the pdf visibility classes to ``soup`` in place.

    Elements with class ``visible-pdf`` lose that class (which un-hides them),
    and elements with class ``hidden-pdf`` are removed from the tree.
    """
    shown_class = "visible-pdf"
    hidden_class = "hidden-pdf"

    # Strip the marker class so the element is no longer hidden.
    for element in soup.find_all(attrs={"class": shown_class}):
        element.attrs["class"].remove(shown_class)

    # Drop elements that must not appear in the PDF.
    for element in soup.find_all(attrs={"class": hidden_class}):
        element.extract()
@frappe.whitelist()
@redis_cache(ttl=60 * 60)
def is_wkhtmltopdf_valid():
    """Tell whether the installed wkhtmltopdf reports "qt" in its version output.

    Returns ``False`` when ``wkhtmltopdf --version`` cannot be run or its
    output cannot be decoded.
    """
    try:
        raw_output = subprocess.check_output(["wkhtmltopdf", "--version"])
        version_text = raw_output.decode("utf-8").lower()
    except Exception:
        return False
    return "qt" in version_text
def get_wkhtmltopdf_version():
    """Return the wkhtmltopdf version string, or ``"0"`` when it is unknown.

    The cached value is used when there is one. Otherwise the second
    space-separated token of ``wkhtmltopdf --version`` is taken and cached.
    Failures while detecting or caching the version are ignored.
    """
    version = frappe.cache.hget("wkhtmltopdf_version", None)
    if version:
        return version

    # Best effort: any failure here just leaves the version unknown.
    with contextlib.suppress(Exception):
        raw_output = subprocess.check_output(["wkhtmltopdf", "--version"])
        version = raw_output.decode("utf-8").split(" ")[1]
        frappe.cache.hset("wkhtmltopdf_version", None, version)

    return version or "0"
def pdf_contains_js(file_content: bytes):
    """
    Check if a PDF file contains JavaScript.

    The document root and every page are searched for a ``/JS`` or
    ``/JavaScript`` key.

    Args:
        file_content (bytes): The content of the PDF file.

    Returns:
        bool: True if such a key is found. False otherwise, including when
        reading the pages raises ``FileNotDecryptedError``.
    """
    js_keys = ("/JS", "/JavaScript")

    def has_javascript(obj):
        # Walk nested dicts/lists without recursion.
        # NOTE(review): only values that are dict or list instances are
        # descended into - confirm whether indirect references need resolving.
        pending = [obj]
        while pending:
            current = pending.pop()
            if isinstance(current, dict):
                if any(key in js_keys for key in current):
                    return True
                pending.extend(current.values())
            elif isinstance(current, list):
                pending.extend(current)
        return False

    reader = PdfReader(io.BytesIO(file_content))

    if has_javascript(reader.trailer.get("/Root", {})):
        return True

    try:
        return any(has_javascript(page) for page in reader.pages)
    except errors.FileNotDecryptedError:
        return False