diff --git a/frappe/app.py b/frappe/app.py
index 29a82ffb0a..6a07c9c223 100644
--- a/frappe/app.py
+++ b/frappe/app.py
@@ -40,7 +40,7 @@ import gettext
import babel
import babel.messages
-import bleach
+import nh3
import num2words
import pydantic
diff --git a/frappe/email/doctype/email_account/test_email_account.py b/frappe/email/doctype/email_account/test_email_account.py
index d0d77a180f..c6f88c2862 100644
--- a/frappe/email/doctype/email_account/test_email_account.py
+++ b/frappe/email/doctype/email_account/test_email_account.py
@@ -132,7 +132,7 @@ class TestEmailAccount(IntegrationTestCase):
TestEmailAccount.mocked_email_receive(email_account, messages)
comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"})
- self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content)
+ self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content)
self.assertTrue(
"This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content
)
@@ -153,7 +153,7 @@ class TestEmailAccount(IntegrationTestCase):
TestEmailAccount.mocked_email_receive(email_account, messages)
comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"})
- self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content)
+ self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content)
self.assertTrue(
"This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content
)
diff --git a/frappe/tests/test_document.py b/frappe/tests/test_document.py
index f1f3705470..8c4ed6ac01 100644
--- a/frappe/tests/test_document.py
+++ b/frappe/tests/test_document.py
@@ -262,16 +262,16 @@ class TestDocument(IntegrationTestCase):
def test_xss_filter(self):
d = self.test_insert()
+ subject = d.subject
# script
xss = ''
- escaped_xss = xss.replace("<", "<").replace(">", ">")
d.subject += xss
d.save()
d.reload()
self.assertTrue(xss not in d.subject)
- self.assertTrue(escaped_xss in d.subject)
+ self.assertEqual(subject, d.subject)
# onload
xss = '
Test
'
diff --git a/frappe/tests/test_utils.py b/frappe/tests/test_utils.py
index a87b7a3793..cd074696e0 100644
--- a/frappe/tests/test_utils.py
+++ b/frappe/tests/test_utils.py
@@ -508,7 +508,7 @@ class TestHTMLUtils(IntegrationTestCase):
sample = """Hello
Para
text"""
clean = clean_email_html(sample)
self.assertTrue("Hello
" in clean)
- self.assertTrue('text' in clean)
+ self.assertTrue('text' in clean)
def test_sanitize_html(self):
from frappe.utils.html_utils import sanitize_html
diff --git a/frappe/utils/html_utils.py b/frappe/utils/html_utils.py
index 11b289e114..091c6047db 100644
--- a/frappe/utils/html_utils.py
+++ b/frappe/utils/html_utils.py
@@ -1,6 +1,7 @@
import json
import re
+import nh3
from bleach_allowlist import bleach_allowlist
import frappe
@@ -16,15 +17,16 @@ EMOJI_PATTERN = re.compile(
flags=re.UNICODE,
)
+# tags for which content needs to be removed from output
+REMOVE_CONTENT_TAGS = {"script", "style"}
+
def clean_html(html):
- import bleach
-
if not isinstance(html, str):
return html
- return bleach.clean(
- clean_script_and_style(html),
+ return nh3.clean(
+ html,
tags={
"div",
"p",
@@ -43,57 +45,51 @@ def clean_html(html):
"td",
"tr",
},
- attributes=[],
- strip=True,
+ clean_content_tags=REMOVE_CONTENT_TAGS,
strip_comments=True,
)
def clean_email_html(html):
- import bleach
- from bleach.css_sanitizer import CSSSanitizer
-
if not isinstance(html, str):
return html
- css_sanitizer = CSSSanitizer(
- allowed_css_properties=[
- "color",
- "border-color",
- "width",
- "height",
- "max-width",
- "background-color",
- "border-collapse",
- "border-radius",
- "border",
- "border-top",
- "border-bottom",
- "border-left",
- "border-right",
- "margin",
- "margin-top",
- "margin-bottom",
- "margin-left",
- "margin-right",
- "padding",
- "padding-top",
- "padding-bottom",
- "padding-left",
- "padding-right",
- "font-size",
- "font-weight",
- "font-family",
- "text-decoration",
- "line-height",
- "text-align",
- "vertical-align",
- "display",
- ]
- )
+ allowed_css_properties = {
+ "color",
+ "border-color",
+ "width",
+ "height",
+ "max-width",
+ "background-color",
+ "border-collapse",
+ "border-radius",
+ "border",
+ "border-top",
+ "border-bottom",
+ "border-left",
+ "border-right",
+ "margin",
+ "margin-top",
+ "margin-bottom",
+ "margin-left",
+ "margin-right",
+ "padding",
+ "padding-top",
+ "padding-bottom",
+ "padding-left",
+ "padding-right",
+ "font-size",
+ "font-weight",
+ "font-family",
+ "text-decoration",
+ "line-height",
+ "text-align",
+ "vertical-align",
+ "display",
+ }
- return bleach.clean(
- clean_script_and_style(html),
+ return nh3.clean(
+ html,
tags={
"div",
"p",
@@ -124,16 +120,20 @@ def clean_email_html(html):
"button",
"img",
},
- attributes=["border", "colspan", "rowspan", "src", "href", "style", "id"],
- css_sanitizer=css_sanitizer,
- protocols=["cid", "http", "https", "mailto", "data", "tel"],
- strip=True,
+ attributes={"*": {"border", "colspan", "rowspan", "src", "href", "style", "id"}},
+ clean_content_tags=REMOVE_CONTENT_TAGS,
+ filter_style_properties=allowed_css_properties,
strip_comments=True,
+ url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid", "data"}),
)
def clean_script_and_style(html):
- # remove script and style
+ """
+ Remove script and style tags.
+ DEPRECATED: prefer nh3.clean's clean_content_tags parameter.
+ """
+
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html5lib")
@@ -145,12 +145,10 @@ def clean_script_and_style(html):
def sanitize_html(html, linkify=False, always_sanitize=False):
"""
Sanitize HTML tags, attributes and style to prevent XSS attacks
- Based on bleach clean, bleach whitelist and html5lib's Sanitizer defaults
+ Based on nh3 clean, bleach whitelist and html5lib's Sanitizer defaults
Does not sanitize JSON unless explicitly specified, as it could lead to future problems
"""
- import bleach
- from bleach.css_sanitizer import CSSSanitizer
from bs4 import BeautifulSoup
if not isinstance(html, str):
@@ -164,28 +162,22 @@ def sanitize_html(html, linkify=False, always_sanitize=False):
return html
tags = (
- acceptable_elements
- + svg_elements
- + mathml_elements
- + ["html", "head", "meta", "link", "body", "style", "o:p"]
+ acceptable_elements.union(svg_elements)
+ .union(mathml_elements)
+ .union(["html", "head", "meta", "link", "body", "o:p"])
)
- def attributes_filter(tag, name, value):
- if name.startswith("data-"):
- return True
- return name in acceptable_attributes
-
- attributes = {"*": attributes_filter, "svg": svg_attributes}
- css_sanitizer = CSSSanitizer(allowed_css_properties=bleach_allowlist.all_styles)
+ attributes = {"*": acceptable_attributes, "svg": svg_attributes}
# returns html with escaped tags, escaped orphan >, <, etc.
- escaped_html = bleach.clean(
+ escaped_html = nh3.clean(
html,
tags=tags,
attributes=attributes,
- css_sanitizer=css_sanitizer,
+ generic_attribute_prefixes={"data-"},
strip_comments=False,
- protocols={"cid", "http", "https", "mailto", "tel"},
+ filter_style_properties=set(bleach_allowlist.all_styles),
+ url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid"}),
)
return escaped_html
@@ -225,7 +217,7 @@ def unescape_html(value):
# adapted from https://raw.githubusercontent.com/html5lib/html5lib-python/4aa79f113e7486c7ec5d15a6e1777bfe546d3259/html5lib/sanitizer.py
-acceptable_elements = [
+acceptable_elements = {
"a",
"abbr",
"acronym",
@@ -327,9 +319,9 @@ acceptable_elements = [
"ul",
"var",
"video",
-]
+}
-mathml_elements = [
+mathml_elements = {
"maction",
"math",
"merror",
@@ -357,9 +349,9 @@ mathml_elements = [
"munder",
"munderover",
"none",
-]
+}
-svg_elements = [
+svg_elements = {
"a",
"animate",
"animateColor",
@@ -395,9 +387,9 @@ svg_elements = [
"title",
"tspan",
"use",
-]
+}
-acceptable_attributes = [
+acceptable_attributes = {
"abbr",
"accept",
"accept-charset",
@@ -501,7 +493,6 @@ acceptable_attributes = [
"prompt",
"radiogroup",
"readonly",
- "rel",
"repeat-max",
"repeat-min",
"replace",
@@ -559,16 +550,13 @@ acceptable_attributes = [
"itemtype",
"itemid",
"itemref",
- "datetime",
"data-is-group",
-]
+}
-mathml_attributes = [
+mathml_attributes = {
"actiontype",
"align",
"columnalign",
- "columnalign",
- "columnalign",
"columnlines",
"columnspacing",
"columnspan",
@@ -587,13 +575,10 @@ mathml_attributes = [
"mathbackground",
"mathcolor",
"mathvariant",
- "mathvariant",
"maxsize",
"minsize",
"other",
"rowalign",
- "rowalign",
- "rowalign",
"rowlines",
"rowspacing",
"rowspan",
@@ -603,15 +588,14 @@ mathml_attributes = [
"separator",
"stretchy",
"width",
- "width",
"xlink:href",
"xlink:show",
"xlink:type",
"xmlns",
"xmlns:xlink",
-]
+}
-svg_attributes = [
+svg_attributes = {
"accent-height",
"accumulate",
"additive",
@@ -754,4 +738,4 @@ svg_attributes = [
"y1",
"y2",
"zoomAndPan",
-]
+}
diff --git a/pyproject.toml b/pyproject.toml
index d8a8be1f34..336db872a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,14 +27,11 @@ dependencies = [
"PyYAML~=6.0.3",
"RestrictedPython~=8.1",
"WeasyPrint==68.0",
- # we don't use tinycss2 directly, but pinned to ensure compatibility with WeasyPrint and bleach
- "tinycss2~=1.5.1,<1.6",
"pydyf==0.12.1",
"Werkzeug==3.1.5",
"Whoosh~=2.7.4",
"beautifulsoup4~=4.13.5",
"bleach-allowlist~=1.0.3",
- "bleach~=6.3.0",
"chardet~=5.2.0",
"croniter~=6.0.0",
"cryptography~=46.0.3",
@@ -46,6 +43,7 @@ dependencies = [
"ldap3~=2.9.1",
"markdown2~=2.5.4",
"MarkupSafe~=3.0.3",
+ "nh3~=0.3.2",
"num2words~=0.5.14",
"oauthlib~=3.2.2",
"openpyxl~=3.1.5",
@@ -108,7 +106,6 @@ dev = [
"types-PyYAML",
"types-Pygments",
"types-beautifulsoup4",
- "types-bleach",
"types-cffi",
"types-colorama",
"types-croniter",