fix: don't try to scan encrypted PDFs

Also fix reversed logic of JS check

Signed-off-by: Akhil Narang <me@akhilnarang.dev>
This commit is contained in:
Akhil Narang 2025-09-10 15:40:54 +05:30
parent 38a7ab4be8
commit 36ccf32ab3
No known key found for this signature in database
GPG key ID: 9DCC61E211BF645F
2 changed files with 21 additions and 9 deletions

View file

@ -390,8 +390,8 @@ class File(Document):
)
def check_content(self):
if self.file_type == "PDF" and self._content and not pdf_contains_js(self._content):
frappe.throw(_("PDF cannot be uploaded, It contains unsafe content"))
if self.file_type == "PDF" and self._content and pdf_contains_js(self._content):
frappe.throw(_("This PDF cannot be uploaded as it contains unsafe content."))
def validate_duplicate_entry(self):
if not self.flags.ignore_duplicate_entry_error and not self.is_folder:

View file

@ -14,7 +14,7 @@ import pdfkit
pdfkit.source.unicode = str # NOTE: upstream bug; PYTHONOPTIMIZE=1 optimized this away
from bs4 import BeautifulSoup
from packaging.version import Version
from pypdf import PdfReader, PdfWriter
from pypdf import PdfReader, PdfWriter, errors
import frappe
from frappe import _
@ -386,7 +386,16 @@ def get_wkhtmltopdf_version():
return wkhtmltopdf_version or "0"
def pdf_contains_js(file_content):
def pdf_contains_js(file_content: bytes):
"""
Check if a PDF file contains JavaScript.
Args:
file_content (bytes): The content of the PDF file.
Returns:
bool: True if the PDF contains JavaScript; False otherwise, including when the pages cannot be scanned because the file is encrypted.
"""
from io import BytesIO
reader = PdfReader(BytesIO(file_content))
@ -406,10 +415,13 @@ def pdf_contains_js(file_content):
root = reader.trailer.get("/Root", {})
if has_javascript(root):
return False
return True
for page in reader.pages:
if has_javascript(page):
return False
try:
for page in reader.pages:
if has_javascript(page):
return True
except errors.FileNotDecryptedError:
pass
return True
return False