diff --git a/frappe/app.py b/frappe/app.py index 29a82ffb0a..6a07c9c223 100644 --- a/frappe/app.py +++ b/frappe/app.py @@ -40,7 +40,7 @@ import gettext import babel import babel.messages -import bleach +import nh3 import num2words import pydantic diff --git a/frappe/email/doctype/email_account/test_email_account.py b/frappe/email/doctype/email_account/test_email_account.py index d0d77a180f..c6f88c2862 100644 --- a/frappe/email/doctype/email_account/test_email_account.py +++ b/frappe/email/doctype/email_account/test_email_account.py @@ -132,7 +132,7 @@ class TestEmailAccount(IntegrationTestCase): TestEmailAccount.mocked_email_receive(email_account, messages) comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"}) - self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content) + self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content) self.assertTrue( "This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content ) @@ -153,7 +153,7 @@ class TestEmailAccount(IntegrationTestCase): TestEmailAccount.mocked_email_receive(email_account, messages) comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"}) - self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content) + self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content) self.assertTrue( "This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content ) diff --git a/frappe/tests/test_document.py b/frappe/tests/test_document.py index f1f3705470..8c4ed6ac01 100644 --- a/frappe/tests/test_document.py +++ b/frappe/tests/test_document.py @@ -262,16 +262,16 @@ class TestDocument(IntegrationTestCase): def test_xss_filter(self): d = self.test_insert() + subject = d.subject # script xss = '' - escaped_xss = xss.replace("<", "<").replace(">", ">") d.subject += xss d.save() d.reload() self.assertTrue(xss not in d.subject) - self.assertTrue(escaped_xss in d.subject) + self.assertEqual(subject, d.subject) # onload xss = '
Test
' diff --git a/frappe/tests/test_utils.py b/frappe/tests/test_utils.py index a87b7a3793..cd074696e0 100644 --- a/frappe/tests/test_utils.py +++ b/frappe/tests/test_utils.py @@ -508,7 +508,7 @@ class TestHTMLUtils(IntegrationTestCase): sample = """

Hello

Para

text""" clean = clean_email_html(sample) self.assertTrue("

Hello

" in clean) - self.assertTrue('text' in clean) + self.assertTrue('text' in clean) def test_sanitize_html(self): from frappe.utils.html_utils import sanitize_html diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py index 11b289e114..091c6047db 100644 --- a/frappe/utils/html_utils.py +++ b/frappe/utils/html_utils.py @@ -1,6 +1,7 @@ import json import re +import nh3 from bleach_allowlist import bleach_allowlist import frappe @@ -16,15 +17,16 @@ EMOJI_PATTERN = re.compile( flags=re.UNICODE, ) +# tags for which content needs to be removed from output +REMOVE_CONTENT_TAGS = {"script", "style"} + def clean_html(html): - import bleach - if not isinstance(html, str): return html - return bleach.clean( - clean_script_and_style(html), + return nh3.clean( + html, tags={ "div", "p", @@ -43,57 +45,51 @@ def clean_html(html): "td", "tr", }, - attributes=[], - strip=True, + clean_content_tags=REMOVE_CONTENT_TAGS, strip_comments=True, ) def clean_email_html(html): - import bleach - from bleach.css_sanitizer import CSSSanitizer - if not isinstance(html, str): return html - css_sanitizer = CSSSanitizer( - allowed_css_properties=[ - "color", - "border-color", - "width", - "height", - "max-width", - "background-color", - "border-collapse", - "border-radius", - "border", - "border-top", - "border-bottom", - "border-left", - "border-right", - "margin", - "margin-top", - "margin-bottom", - "margin-left", - "margin-right", - "padding", - "padding-top", - "padding-bottom", - "padding-left", - "padding-right", - "font-size", - "font-weight", - "font-family", - "text-decoration", - "line-height", - "text-align", - "vertical-align", - "display", - ] - ) + allowed_css_properties = { + "color", + "border-color", + "width", + "height", + "max-width", + "background-color", + "border-collapse", + "border-radius", + "border", + "border-top", + "border-bottom", + "border-left", + "border-right", + "margin", + "margin-top", + "margin-bottom", + "margin-left", + "margin-right", + "padding", + "padding-top", + "padding-bottom", + "padding-left", + "padding-right", + "font-size", + "font-weight", + "font-family", + "text-decoration", + "line-height", + "text-align", + "vertical-align", + "display", + } - return bleach.clean( - clean_script_and_style(html), + return nh3.clean( + html, tags={ "div", "p", @@ -124,16 +120,20 @@ def clean_email_html(html): "button", "img", }, - attributes=["border", "colspan", "rowspan", "src", "href", "style", "id"], - css_sanitizer=css_sanitizer, - protocols=["cid", "http", "https", "mailto", "data", "tel"], - strip=True, + attributes={"*": {"border", "colspan", "rowspan", "src", "href", "style", "id"}}, + clean_content_tags=REMOVE_CONTENT_TAGS, + filter_style_properties=allowed_css_properties, strip_comments=True, + url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid", "data"}), ) def clean_script_and_style(html): - # remove script and style + """ + Remove script and style tags. + DEPRECATED: prefer nh3.clean's clean_content_tags parameter. + """ + from bs4 import BeautifulSoup soup = BeautifulSoup(html, "html5lib") @@ -145,12 +145,10 @@ def clean_script_and_style(html): def sanitize_html(html, linkify=False, always_sanitize=False): """ Sanitize HTML tags, attributes and style to prevent XSS attacks - Based on bleach clean, bleach whitelist and html5lib's Sanitizer defaults + Based on nh3 clean, bleach whitelist and html5lib's Sanitizer defaults Does not sanitize JSON unless explicitly specified, as it could lead to future problems """ - import bleach - from bleach.css_sanitizer import CSSSanitizer from bs4 import BeautifulSoup if not isinstance(html, str): @@ -164,28 +162,22 @@ def sanitize_html(html, linkify=False, always_sanitize=False): return html tags = ( - acceptable_elements - + svg_elements - + mathml_elements - + ["html", "head", "meta", "link", "body", "style", "o:p"] + acceptable_elements.union(svg_elements) + .union(mathml_elements) + .union(["html", "head", "meta", "link", "body", "o:p"]) ) - def attributes_filter(tag, name, value): - if name.startswith("data-"): - return True - return name in acceptable_attributes - - attributes = {"*": attributes_filter, "svg": svg_attributes} - css_sanitizer = CSSSanitizer(allowed_css_properties=bleach_allowlist.all_styles) + attributes = {"*": acceptable_attributes, "svg": svg_attributes} # returns html with escaped tags, escaped orphan >, <, etc. - escaped_html = bleach.clean( + escaped_html = nh3.clean( html, tags=tags, attributes=attributes, - css_sanitizer=css_sanitizer, + generic_attribute_prefixes={"data-"}, strip_comments=False, - protocols={"cid", "http", "https", "mailto", "tel"}, + filter_style_properties=set(bleach_allowlist.all_styles), + url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid"}), ) return escaped_html @@ -225,7 +217,7 @@ def unescape_html(value): # adapted from https://raw.githubusercontent.com/html5lib/html5lib-python/4aa79f113e7486c7ec5d15a6e1777bfe546d3259/html5lib/sanitizer.py -acceptable_elements = [ +acceptable_elements = { "a", "abbr", "acronym", @@ -327,9 +319,9 @@ acceptable_elements = [ "ul", "var", "video", -] +} -mathml_elements = [ +mathml_elements = { "maction", "math", "merror", @@ -357,9 +349,9 @@ mathml_elements = [ "munder", "munderover", "none", -] +} -svg_elements = [ +svg_elements = { "a", "animate", "animateColor", @@ -395,9 +387,9 @@ svg_elements = [ "title", "tspan", "use", -] +} -acceptable_attributes = [ +acceptable_attributes = { "abbr", "accept", "accept-charset", @@ -501,7 +493,6 @@ acceptable_attributes = [ "prompt", "radiogroup", "readonly", - "rel", "repeat-max", "repeat-min", "replace", @@ -559,16 +550,13 @@ acceptable_attributes = [ "itemtype", "itemid", "itemref", - "datetime", "data-is-group", -] +} -mathml_attributes = [ +mathml_attributes = { "actiontype", "align", "columnalign", - "columnalign", - "columnalign", "columnlines", "columnspacing", "columnspan", @@ -587,13 +575,10 @@ mathml_attributes = [ "mathbackground", "mathcolor", "mathvariant", - "mathvariant", "maxsize", "minsize", "other", "rowalign", - "rowalign", - "rowalign", "rowlines", "rowspacing", "rowspan", @@ -603,15 +588,14 @@ mathml_attributes = [ "separator", "stretchy", "width", - "width", "xlink:href", "xlink:show", "xlink:type", "xmlns", "xmlns:xlink", -] +} -svg_attributes = [ +svg_attributes = { "accent-height", "accumulate", "additive", @@ -754,4 +738,4 @@ svg_attributes = [ "y1", "y2", "zoomAndPan", -] +} diff --git a/pyproject.toml b/pyproject.toml index d8a8be1f34..336db872a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,14 +27,11 @@ dependencies = [ "PyYAML~=6.0.3", "RestrictedPython~=8.1", "WeasyPrint==68.0", - # we don't use tinycss2 directly, but pinned to ensure compatibility with WeasyPrint and bleach - "tinycss2~=1.5.1,<1.6", "pydyf==0.12.1", "Werkzeug==3.1.5", "Whoosh~=2.7.4", "beautifulsoup4~=4.13.5", "bleach-allowlist~=1.0.3", - "bleach~=6.3.0", "chardet~=5.2.0", "croniter~=6.0.0", "cryptography~=46.0.3", @@ -46,6 +43,7 @@ dependencies = [ "ldap3~=2.9.1", "markdown2~=2.5.4", "MarkupSafe~=3.0.3", + "nh3~=0.3.2", "num2words~=0.5.14", "oauthlib~=3.2.2", "openpyxl~=3.1.5", @@ -108,7 +106,6 @@ dev = [ "types-PyYAML", "types-Pygments", "types-beautifulsoup4", - "types-bleach", "types-cffi", "types-colorama", "types-croniter",