From 577ba89c28f6e730977d27018649b77783411d4f Mon Sep 17 00:00:00 2001 From: sokumon Date: Thu, 26 Jun 2025 16:06:24 +0530 Subject: [PATCH 1/5] fix(XSS): verify pdf content before uploading --- frappe/core/doctype/file/file.py | 9 ++++++++- frappe/utils/pdf.py | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/frappe/core/doctype/file/file.py b/frappe/core/doctype/file/file.py index 5cbef2db16..37a91bbd54 100755 --- a/frappe/core/doctype/file/file.py +++ b/frappe/core/doctype/file/file.py @@ -20,6 +20,7 @@ from frappe.permissions import SYSTEM_USER_ROLE, get_doctypes_with_read from frappe.utils import call_hook_method, cint, get_files_path, get_hook_method, get_url from frappe.utils.file_manager import is_safe_path from frappe.utils.image import optimize_image, strip_exif_data +from frappe.utils.pdf import is_pdf_safe from .exceptions import ( AttachmentLimitReached, @@ -131,8 +132,8 @@ class File(Document): self.validate_file_path() self.validate_file_url() self.validate_file_on_disk() - self.file_size = frappe.form_dict.file_size or self.file_size + self.check_for_malicious_content() def validate_attachment_references(self): if not self.attached_to_doctype: @@ -374,6 +375,12 @@ class File(Document): if self.file_type not in allowed_extensions.splitlines(): frappe.throw(_("File type of {0} is not allowed").format(self.file_type), exc=FileTypeNotAllowed) + def check_for_malicious_content(self): + file_name = self.file_url.split("/")[-1] + print(self.file_type) + if self.file_type == "PDF" and not is_pdf_safe(get_files_path(file_name, is_private=self.is_private)): + frappe.throw("PDF contains malicious content") + def validate_duplicate_entry(self): if not self.flags.ignore_duplicate_entry_error and not self.is_folder: if not self.content_hash: diff --git a/frappe/utils/pdf.py b/frappe/utils/pdf.py index 9f0469f0b4..1de1060122 100644 --- a/frappe/utils/pdf.py +++ b/frappe/utils/pdf.py @@ -384,3 +384,30 @@ def get_wkhtmltopdf_version(): pass return wkhtmltopdf_version or "0" + + +def is_pdf_safe(pdf_path): + reader = PdfReader(pdf_path) + + def has_javascript(obj): + if isinstance(obj, dict): + for key, value in obj.items(): + if key in ("/JS", "/JavaScript"): + return True + if has_javascript(value): + return True + elif isinstance(obj, list): + for item in obj: + if has_javascript(item): + return True + return False + + root = reader.trailer.get("/Root", {}) + if has_javascript(root): + return False + + for page in reader.pages: + if has_javascript(page): + return False + + return True From ca5831b1a8eb5593895cce584d6aa7395f54db6a Mon Sep 17 00:00:00 2001 From: sokumon Date: Tue, 12 Aug 2025 23:55:33 +0530 Subject: [PATCH 2/5] fix: better function name, check before writing the file --- frappe/core/doctype/file/file.py | 10 ++++------ frappe/utils/pdf.py | 6 ++++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/frappe/core/doctype/file/file.py b/frappe/core/doctype/file/file.py index 37a91bbd54..76e8f8691d 100755 --- a/frappe/core/doctype/file/file.py +++ b/frappe/core/doctype/file/file.py @@ -133,7 +133,7 @@ class File(Document): self.validate_file_url() self.validate_file_on_disk() self.file_size = frappe.form_dict.file_size or self.file_size - self.check_for_malicious_content() + self.check_content() def validate_attachment_references(self): if not self.attached_to_doctype: @@ -375,10 +375,8 @@ class File(Document): if self.file_type not in allowed_extensions.splitlines(): frappe.throw(_("File type of {0} is not allowed").format(self.file_type), exc=FileTypeNotAllowed) - def check_for_malicious_content(self): - file_name = self.file_url.split("/")[-1] - print(self.file_type) - if self.file_type == "PDF" and not is_pdf_safe(get_files_path(file_name, is_private=self.is_private)): + def check_content(self): + if self.file_type == "PDF" and not is_pdf_safe(self._content): frappe.throw("PDF contains malicious content") def validate_duplicate_entry(self): @@ -641,7 +639,7 @@ class File(Document): if isinstance(self._content, str): self._content = self._content.encode() - + self.check_content() with open(file_path, "wb+") as f: f.write(self._content) os.fsync(f.fileno()) diff --git a/frappe/utils/pdf.py b/frappe/utils/pdf.py index 1de1060122..fa420121f5 100644 --- a/frappe/utils/pdf.py +++ b/frappe/utils/pdf.py @@ -386,8 +386,10 @@ def get_wkhtmltopdf_version(): return wkhtmltopdf_version or "0" -def is_pdf_safe(pdf_path): - reader = PdfReader(pdf_path) +def is_pdf_safe(file_content): + from io import BytesIO + + reader = PdfReader(BytesIO(file_content)) def has_javascript(obj): if isinstance(obj, dict): From bbc093640b00a1a19e5431a7c714c4c2fdbcbe3d Mon Sep 17 00:00:00 2001 From: sokumon Date: Mon, 18 Aug 2025 15:14:53 +0530 Subject: [PATCH 3/5] fix: better message --- frappe/core/doctype/file/file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frappe/core/doctype/file/file.py b/frappe/core/doctype/file/file.py index 76e8f8691d..6cea08e050 100755 --- a/frappe/core/doctype/file/file.py +++ b/frappe/core/doctype/file/file.py @@ -377,7 +377,7 @@ class File(Document): def check_content(self): if self.file_type == "PDF" and not is_pdf_safe(self._content): - frappe.throw("PDF contains malicious content") + frappe.throw(_("PDF cannot be uploaded, It contains unsafe content")) def validate_duplicate_entry(self): if not self.flags.ignore_duplicate_entry_error and not self.is_folder: From fd291031496f6aca4c758e8cb17e920ca92a27f2 Mon Sep 17 00:00:00 2001 From: sokumon Date: Mon, 25 Aug 2025 14:27:23 +0530 Subject: [PATCH 4/5] fix: make the checking more robust --- frappe/core/doctype/file/file.py | 4 +-- frappe/utils/pdf.py | 56 +++++++++++++++++++------------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/frappe/core/doctype/file/file.py b/frappe/core/doctype/file/file.py index 6cea08e050..18bf4aed65 100755 --- a/frappe/core/doctype/file/file.py +++ b/frappe/core/doctype/file/file.py @@ -20,7 +20,7 @@ from frappe.permissions import SYSTEM_USER_ROLE, get_doctypes_with_read from frappe.utils import call_hook_method, cint, get_files_path, get_hook_method, get_url from frappe.utils.file_manager import is_safe_path from frappe.utils.image import optimize_image, strip_exif_data -from frappe.utils.pdf import is_pdf_safe +from frappe.utils.pdf import pdf_contains_js from .exceptions import ( AttachmentLimitReached, @@ -376,7 +376,7 @@ class File(Document): frappe.throw(_("File type of {0} is not allowed").format(self.file_type), exc=FileTypeNotAllowed) def check_content(self): - if self.file_type == "PDF" and not is_pdf_safe(self._content): + if self.file_type == "PDF" and not pdf_contains_js(self._content): frappe.throw(_("PDF cannot be uploaded, It contains unsafe content")) def validate_duplicate_entry(self): diff --git a/frappe/utils/pdf.py b/frappe/utils/pdf.py index fa420121f5..bc6f06afba 100644 --- a/frappe/utils/pdf.py +++ b/frappe/utils/pdf.py @@ -386,30 +386,42 @@ def get_wkhtmltopdf_version(): return wkhtmltopdf_version or "0" -def is_pdf_safe(file_content): - from io import BytesIO +def pdf_contains_js(file_content: bytes) -> bool: + """ + Robustly checks if a PDF's content contains embedded JavaScript. + Returns True if JS is found, False otherwise. + """ + from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject - reader = PdfReader(BytesIO(file_content)) + try: + reader = PdfReader(io.BytesIO(file_content)) - def has_javascript(obj): - if isinstance(obj, dict): - for key, value in obj.items(): - if key in ("/JS", "/JavaScript"): + def has_javascript_deep(obj): + """ + Recursively search for JavaScript actions in the PDF object tree, + handling indirect objects and various PDF data structures. + """ + if isinstance(obj, IndirectObject): + obj = obj.get_object() + + if isinstance(obj, DictionaryObject): + if obj.get("/S") == "/JavaScript" or "/JS" in obj: return True - if has_javascript(value): - return True - elif isinstance(obj, list): - for item in obj: - if has_javascript(item): - return True - return False - - root = reader.trailer.get("/Root", {}) - if has_javascript(root): - return False - - for page in reader.pages: - if has_javascript(page): + for value in obj.values(): + if has_javascript_deep(value): + return True + elif isinstance(obj, ArrayObject): + for item in obj: + if has_javascript_deep(item): + return True return False - return True + if has_javascript_deep(reader.trailer): + return True + + except Exception as e: + print(f"An error occurred during PDF parsing: {e}") + # If parsing fails, it could be a malformed/malicious file. + return False + + return False From 5e21a2e2abd28f46d881b8424a7e762ecbf99040 Mon Sep 17 00:00:00 2001 From: sokumon Date: Tue, 2 Sep 2025 14:46:42 +0530 Subject: [PATCH 5/5] fix: relax the logic --- frappe/utils/pdf.py | 56 ++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/frappe/utils/pdf.py b/frappe/utils/pdf.py index bc6f06afba..8f0d4f7fed 100644 --- a/frappe/utils/pdf.py +++ b/frappe/utils/pdf.py @@ -386,42 +386,30 @@ def get_wkhtmltopdf_version(): return wkhtmltopdf_version or "0" -def pdf_contains_js(file_content: bytes) -> bool: - """ - Robustly checks if a PDF's content contains embedded JavaScript. - Returns True if JS is found, False otherwise. - """ - from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject +def pdf_contains_js(file_content): + from io import BytesIO - try: - reader = PdfReader(io.BytesIO(file_content)) + reader = PdfReader(BytesIO(file_content)) - def has_javascript_deep(obj): - """ - Recursively search for JavaScript actions in the PDF object tree, - handling indirect objects and various PDF data structures. - """ - if isinstance(obj, IndirectObject): - obj = obj.get_object() - - if isinstance(obj, DictionaryObject): - if obj.get("/S") == "/JavaScript" or "/JS" in obj: + def has_javascript(obj): + if isinstance(obj, dict): + for key, value in obj.items(): + if key in ("/JS", "/JavaScript"): + return True + if has_javascript(value): + return True + elif isinstance(obj, list): + for item in obj: + if has_javascript(item): return True - for value in obj.values(): - if has_javascript_deep(value): - return True - elif isinstance(obj, ArrayObject): - for item in obj: - if has_javascript_deep(item): - return True - return False - - if has_javascript_deep(reader.trailer): - return True - - except Exception as e: - print(f"An error occurred during PDF parsing: {e}") - # If parsing fails, it could be a malformed/malicious file. return False - return False + root = reader.trailer.get("/Root", {}) + if has_javascript(root): + return False + + for page in reader.pages: + if has_javascript(page): + return False + + return True