From 4ad934719b8399344296a54a1f5a03785fff556d Mon Sep 17 00:00:00 2001 From: Alex Leach Date: Mon, 17 Nov 2025 18:26:11 +0000 Subject: [PATCH 01/13] chore: Replace bleach HTML sanitiser for nh3 --- frappe/utils/html_utils.py | 112 ++++++++++++++++--------------------- pyproject.toml | 2 +- 2 files changed, 48 insertions(+), 66 deletions(-) diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index 11b289e114..807c901caa 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -1,6 +1,7 @@ import json import re +import nh3 from bleach_allowlist import bleach_allowlist import frappe @@ -18,12 +19,10 @@ EMOJI_PATTERN = re.compile( def clean_html(html): - import bleach - if not isinstance(html, str): return html - return bleach.clean( + return nh3.clean( clean_script_and_style(html), tags={ "div", @@ -43,56 +42,49 @@ def clean_html(html): "td", "tr", }, - attributes=[], - strip=True, strip_comments=True, ) def clean_email_html(html): - import bleach - from bleach.css_sanitizer import CSSSanitizer - if not isinstance(html, str): return html - css_sanitizer = CSSSanitizer( - allowed_css_properties=[ - "color", - "border-color", - "width", - "height", - "max-width", - "background-color", - "border-collapse", - "border-radius", - "border", - "border-top", - "border-bottom", - "border-left", - "border-right", - "margin", - "margin-top", - "margin-bottom", - "margin-left", - "margin-right", - "padding", - "padding-top", - "padding-bottom", - "padding-left", - "padding-right", - "font-size", - "font-weight", - "font-family", - "text-decoration", - "line-height", - "text-align", - "vertical-align", - "display", - ] - ) + allowed_css_properties = { + "color", + "border-color", + "width", + "height", + "max-width", + "background-color", + "border-collapse", + "border-radius", + "border", + "border-top", + "border-bottom", + "border-left", + "border-right", + "margin", + "margin-top", + "margin-bottom", + "margin-left", + "margin-right", + "padding", + "padding-top", + "padding-bottom", + "padding-left", + "padding-right", + "font-size", + "font-weight", + "font-family", + "text-decoration", + "line-height", + "text-align", + "vertical-align", + "display", + } - return bleach.clean( + return nh3.clean( clean_script_and_style(html), tags={ "div", @@ -124,10 +116,8 @@ def clean_email_html(html): "button", "img", }, - attributes=["border", "colspan", "rowspan", "src", "href", "style", "id"], - css_sanitizer=css_sanitizer, - protocols=["cid", "http", "https", "mailto", "data", "tel"], - strip=True, + attributes={"*": {"border", "colspan", "rowspan", "src", "href", "style", "id"}}, + filter_style_properties=allowed_css_properties, strip_comments=True, ) @@ -145,12 +135,11 @@ def clean_script_and_style(html): def sanitize_html(html, linkify=False, always_sanitize=False): """ Sanitize HTML tags, attributes and style to prevent XSS attacks - Based on bleach clean, bleach whitelist and html5lib's Sanitizer defaults + Based on nh3 clean (formerly bleach clean), bleach whitelist and html5lib's + Sanitizer defaults Does not sanitize JSON unless explicitly specified, as it could lead to future problems """ - import bleach - from bleach.css_sanitizer import CSSSanitizer from bs4 import BeautifulSoup if not isinstance(html, str): @@ -164,28 +153,21 @@ def sanitize_html(html, linkify=False, always_sanitize=False): return html tags = ( - acceptable_elements - + svg_elements - + mathml_elements - + ["html", "head", "meta", "link", "body", "style", "o:p"] + acceptable_elements.union(svg_elements) + .union(mathml_elements) + .union(["html", "head", "meta", "link", "body", "o:p"]) ) - def attributes_filter(tag, name, value): - if name.startswith("data-"): - return True - return name in acceptable_attributes - - attributes = {"*": attributes_filter, "svg": svg_attributes} - css_sanitizer = CSSSanitizer(allowed_css_properties=bleach_allowlist.all_styles) + attributes = {"*": acceptable_attributes, "svg": svg_attributes} # returns html with escaped tags, escaped orphan >, <, etc. - escaped_html = bleach.clean( + escaped_html = nh3.clean( html, tags=tags, attributes=attributes, - css_sanitizer=css_sanitizer, + generic_attribute_prefixes={"data-"}, strip_comments=False, - protocols={"cid", "http", "https", "mailto", "tel"}, + filter_style_properties=set(bleach_allowlist.all_styles), ) return escaped_html diff --git a/pyproject.toml b/pyproject.toml index 051a084b01..e1c028c420 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,6 @@ dependencies = [ "Whoosh~=2.7.4", "beautifulsoup4~=4.13.5", "bleach-allowlist~=1.0.3", - "bleach[css]~=6.3.0", "chardet~=5.2.0", "croniter~=6.0.0", "cryptography~=46.0.3", @@ -44,6 +43,7 @@ dependencies = [ "ldap3~=2.9.1", "markdown2~=2.5.4", "MarkupSafe~=3.0.3", + "nh3>=0.3.2", "num2words~=0.5.14", "oauthlib~=3.2.2", "openpyxl~=3.1.5", From 08fc19d032c966bb2af5eedac6cdc7f151205cb8 Mon Sep 17 00:00:00 2001 From: Alex Leach Date: Mon, 17 Nov 2025 20:31:34 +0000 Subject: [PATCH 02/13] chore: Remove rel acceptable attribute --- frappe/utils/html_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index 807c901caa..ef555825b7 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -483,7 +483,6 @@ acceptable_attributes = [ "prompt", "radiogroup", "readonly", - "rel", "repeat-max", "repeat-min", "replace", From 2af319bb23f1e1b2f199a1d16eed4c2ed455701e Mon Sep 17 00:00:00 2001 From: Alex Leach Date: Mon, 17 Nov 2025 21:55:44 +0000 Subject: [PATCH 03/13] chore: bleach-nh3. Convert lists to sets (again; fighting against pre-commit making indent changes that obscured 'rel' from acceptable_attributes) --- frappe/utils/html_utils.py | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index ef555825b7..cc7ac6cabc 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -207,7 +207,7 @@ def unescape_html(value): # adapted from https://raw.githubusercontent.com/html5lib/html5lib-python/4aa79f113e7486c7ec5d15a6e1777bfe546d3259/html5lib/sanitizer.py -acceptable_elements = [ +acceptable_elements = { "a", "abbr", "acronym", @@ -309,9 +309,9 @@ acceptable_elements = [ "ul", "var", "video", -] +} -mathml_elements = [ +mathml_elements = { "maction", "math", "merror", @@ -339,9 +339,9 @@ mathml_elements = [ "munder", "munderover", "none", -] +} -svg_elements = [ +svg_elements = { "a", "animate", "animateColor", @@ -377,9 +377,9 @@ svg_elements = [ "title", "tspan", "use", -] +} -acceptable_attributes = [ +acceptable_attributes = { "abbr", "accept", "accept-charset", @@ -540,16 +540,13 @@ acceptable_attributes = [ "itemtype", "itemid", "itemref", - "datetime", "data-is-group", -] +} -mathml_attributes = [ +mathml_attributes = { "actiontype", "align", "columnalign", - "columnalign", - "columnalign", "columnlines", "columnspacing", "columnspan", @@ -568,13 +565,10 @@ mathml_attributes = [ "mathbackground", "mathcolor", "mathvariant", - "mathvariant", "maxsize", "minsize", "other", "rowalign", - "rowalign", - "rowalign", "rowlines", "rowspacing", "rowspan", @@ -584,15 +578,14 @@ mathml_attributes = [ "separator", "stretchy", "width", - "width", "xlink:href", "xlink:show", "xlink:type", "xmlns", "xmlns:xlink", -] +} -svg_attributes = [ +svg_attributes = { "accent-height", "accumulate", "additive", @@ -735,4 +728,4 @@ svg_attributes = [ "y1", "y2", "zoomAndPan", -] +} From 5e7c8da8a6c420a085b6009cd663476e6e6b3171 Mon Sep 17 00:00:00 2001 From: Alex Leach Date: Tue, 18 Nov 2025 11:54:28 +0000 Subject: [PATCH 04/13] fix: Allow previously allowed href protocols (cid: and data:) --- frappe/utils/html_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index cc7ac6cabc..b440e42c32 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -119,6 +119,7 @@ def clean_email_html(html): attributes={"*": {"border", "colspan", "rowspan", "src", "href", "style", "id"}}, filter_style_properties=allowed_css_properties, strip_comments=True, + url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid", "data"}), ) @@ -168,6 +169,7 @@ def sanitize_html(html, linkify=False, always_sanitize=False): generic_attribute_prefixes={"data-"}, strip_comments=False, filter_style_properties=set(bleach_allowlist.all_styles), + url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid"}), ) return escaped_html From 3b1ae43e94acca8559bb77c6e415af0fa67a5e30 Mon Sep 17 00:00:00 2001 From: Alex Leach Date: Fri, 21 Nov 2025 09:23:37 +0000 Subject: [PATCH 05/13] fix: Replace global bleach import with nh3 --- frappe/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/frappe/app.py b/frappe/app.py index 49cacb9a46..010b5bde0d 100644 --- a/frappe/app.py +++ b/frappe/app.py @@ -40,7 +40,7 @@ import gettext import babel import babel.messages -import bleach +import nh3 import num2words import pydantic From a5ef0104f1bb80a26f095aca6f4220305006eda0 Mon Sep 17 00:00:00 2001 From: Alex Leach Date: Fri, 21 Nov 2025 16:36:35 +0000 Subject: [PATCH 06/13] fix: Update failing email_account unit tests that don't play with nh3 --- frappe/email/doctype/email_account/test_email_account.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/frappe/email/doctype/email_account/test_email_account.py b/frappe/email/doctype/email_account/test_email_account.py index d0d77a180f..c6f88c2862 100644 --- a/frappe/email/doctype/email_account/test_email_account.py +++ b/frappe/email/doctype/email_account/test_email_account.py @@ -132,7 +132,7 @@ class TestEmailAccount(IntegrationTestCase): TestEmailAccount.mocked_email_receive(email_account, messages) comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"}) - self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content) + self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content) self.assertTrue( "This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content ) @@ -153,7 +153,7 @@ class TestEmailAccount(IntegrationTestCase): TestEmailAccount.mocked_email_receive(email_account, messages) comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"}) - self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content) + self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content) self.assertTrue( "This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content ) From 8a826996404f39621432fe3a255d7c2cf0c024ae Mon Sep 17 00:00:00 2001 From: Alex Leach Date: Fri, 21 Nov 2025 23:47:23 +0000 Subject: [PATCH 07/13] fix(tests): Update more tests to conform with nh3 behaviour --- frappe/tests/test_document.py | 3 ++- frappe/tests/test_utils.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/frappe/tests/test_document.py b/frappe/tests/test_document.py index f1f3705470..27ffe39fca 100644 --- a/frappe/tests/test_document.py +++ b/frappe/tests/test_document.py @@ -262,6 +262,7 @@ class TestDocument(IntegrationTestCase): def test_xss_filter(self): d = self.test_insert() + subject = d.subject # script xss = '' @@ -271,7 +272,7 @@ class TestDocument(IntegrationTestCase): d.reload() self.assertTrue(xss not in d.subject) - self.assertTrue(escaped_xss in d.subject) + self.assertEqual(subject, d.subject) # onload xss = '
Test
' diff --git a/frappe/tests/test_utils.py b/frappe/tests/test_utils.py index a87b7a3793..cd074696e0 100644 --- a/frappe/tests/test_utils.py +++ b/frappe/tests/test_utils.py @@ -508,7 +508,7 @@ class TestHTMLUtils(IntegrationTestCase): sample = """

Hello

Para

text""" clean = clean_email_html(sample) self.assertTrue("

Hello

" in clean) - self.assertTrue('text' in clean) + self.assertTrue('text' in clean) def test_sanitize_html(self): from frappe.utils.html_utils import sanitize_html From 125bd67eadd1786cbfec42999e9ba99e8a5d5bfb Mon Sep 17 00:00:00 2001 From: Sagar Vora <16315650+sagarvora@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:39:08 +0530 Subject: [PATCH 08/13] revert: undo manual pinning for tinycss2 --- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e1cd00aad5..d9a263e291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,8 +27,6 @@ dependencies = [ "PyYAML~=6.0.3", "RestrictedPython~=8.1", "WeasyPrint==68.0", - # we don't use tinycss2 directly, but pinned to ensure compatibility with WeasyPrint and bleach - "tinycss2~=1.5.1,<1.6", "pydyf==0.12.1", "Werkzeug==3.1.5", "Whoosh~=2.7.4", From 61bc172d95b7ea62562e81e8e9ff7f6068ce5530 Mon Sep 17 00:00:00 2001 From: Sagar Vora <16315650+sagarvora@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:45:23 +0530 Subject: [PATCH 09/13] test: remove unused var --- frappe/tests/test_document.py | 1 - 1 file changed, 1 deletion(-) diff --git a/frappe/tests/test_document.py b/frappe/tests/test_document.py index 27ffe39fca..8c4ed6ac01 100644 --- a/frappe/tests/test_document.py +++ b/frappe/tests/test_document.py @@ -266,7 +266,6 @@ class TestDocument(IntegrationTestCase): # script xss = '' - escaped_xss = xss.replace("<", "<").replace(">", ">") d.subject += xss d.save() d.reload() From 84359c8d4951b5007de816ced98f35c324893c9b Mon Sep 17 00:00:00 2001 From: Sagar Vora <16315650+sagarvora@users.noreply.github.com> Date: Fri, 23 Jan 2026 14:48:10 +0530 Subject: [PATCH 10/13] chore: improve misleading docstring --- frappe/utils/html_utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index b440e42c32..e53983f3dc 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -136,8 +136,7 @@ def clean_script_and_style(html): def sanitize_html(html, linkify=False, always_sanitize=False): """ Sanitize HTML tags, attributes and style to prevent XSS attacks - Based on nh3 clean (formerly bleach clean), bleach whitelist and html5lib's - Sanitizer defaults + Based on nh3 clean, bleach whitelist and html5lib's Sanitizer defaults Does not sanitize JSON unless explicitly specified, as it could lead to future problems """ From ca10a3af7a299107f1f0255e024b3181826a9554 Mon Sep 17 00:00:00 2001 From: Sagar Vora <16315650+sagarvora@users.noreply.github.com> Date: Fri, 23 Jan 2026 15:05:24 +0530 Subject: [PATCH 11/13] build: improve nh3 dependency specification --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index d9a263e291..82a7c83a63 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ dependencies = [ "ldap3~=2.9.1", "markdown2~=2.5.4", "MarkupSafe~=3.0.3", - "nh3>=0.3.2", + "nh3~=0.3.2", "num2words~=0.5.14", "oauthlib~=3.2.2", "openpyxl~=3.1.5", From 5e2c526b9e82ddfc7e38a19f3e2a9c002b3b5fcd Mon Sep 17 00:00:00 2001 From: Sagar Vora <16315650+sagarvora@users.noreply.github.com> Date: Fri, 23 Jan 2026 15:18:05 +0530 Subject: [PATCH 12/13] perf: prefer nh3 to clean script and style tags --- frappe/utils/html_utils.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index e53983f3dc..091c6047db 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -17,13 +17,16 @@ EMOJI_PATTERN = re.compile( flags=re.UNICODE, ) +# tags for which content needs to be removed from output +REMOVE_CONTENT_TAGS = {"script", "style"} + def clean_html(html): if not isinstance(html, str): return html return nh3.clean( - clean_script_and_style(html), + html, tags={ "div", "p", @@ -42,6 +45,7 @@ def clean_html(html): "td", "tr", }, + clean_content_tags=REMOVE_CONTENT_TAGS, strip_comments=True, ) @@ -85,7 +89,7 @@ def clean_email_html(html): } return nh3.clean( - clean_script_and_style(html), + html, tags={ "div", "p", @@ -117,6 +121,7 @@ def clean_email_html(html): "img", }, attributes={"*": {"border", "colspan", "rowspan", "src", "href", "style", "id"}}, + clean_content_tags=REMOVE_CONTENT_TAGS, filter_style_properties=allowed_css_properties, strip_comments=True, url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid", "data"}), @@ -124,7 +129,11 @@ def clean_email_html(html): def clean_script_and_style(html): - # remove script and style + """ + Remove script and style tags. + DEPRECATED: prefer nh3.clean's clean_content_tags parameter. + """ + from bs4 import BeautifulSoup soup = BeautifulSoup(html, "html5lib") From bb9c565a0871f4fd4506bfa1c56480eb69b4bf76 Mon Sep 17 00:00:00 2001 From: Sagar Vora <16315650+sagarvora@users.noreply.github.com> Date: Fri, 23 Jan 2026 15:19:28 +0530 Subject: [PATCH 13/13] build: remove option types-bleach dependency --- pyproject.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 82a7c83a63..d9da51c3b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,6 @@ dev = [ "types-PyYAML", "types-Pygments", "types-beautifulsoup4", - "types-bleach", "types-cffi", "types-colorama", "types-croniter",