Merge pull request #34740 from alexleach/bleach-to-nh3

refactor!: Replace bleach HTML sanitiser for nh3
This commit is contained in:
Akhil Narang 2026-01-27 17:09:53 +05:30 committed by GitHub
commit fb56fbcab8
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 79 additions and 98 deletions

View file

@ -40,7 +40,7 @@ import gettext
import babel
import babel.messages
import bleach
import nh3
import num2words
import pydantic

View file

@ -132,7 +132,7 @@ class TestEmailAccount(IntegrationTestCase):
TestEmailAccount.mocked_email_receive(email_account, messages)
comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"})
self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content)
self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content)
self.assertTrue(
"This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content
)
@ -153,7 +153,7 @@ class TestEmailAccount(IntegrationTestCase):
TestEmailAccount.mocked_email_receive(email_account, messages)
comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"})
self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content)
self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content)
self.assertTrue(
"This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content
)

View file

@ -262,16 +262,16 @@ class TestDocument(IntegrationTestCase):
def test_xss_filter(self):
d = self.test_insert()
subject = d.subject
# script
xss = '<script>alert("XSS")</script>'
escaped_xss = xss.replace("<", "&lt;").replace(">", "&gt;")
d.subject += xss
d.save()
d.reload()
self.assertTrue(xss not in d.subject)
self.assertTrue(escaped_xss in d.subject)
self.assertEqual(subject, d.subject)
# onload
xss = '<div onload="alert("XSS")">Test</div>'

View file

@ -508,7 +508,7 @@ class TestHTMLUtils(IntegrationTestCase):
sample = """<h1>Hello</h1><p>Para</p><a href="http://test.com">text</a>"""
clean = clean_email_html(sample)
self.assertTrue("<h1>Hello</h1>" in clean)
self.assertTrue('<a href="http://test.com">text</a>' in clean)
self.assertTrue('<a href="http://test.com" rel="noopener noreferrer">text</a>' in clean)
def test_sanitize_html(self):
from frappe.utils.html_utils import sanitize_html

View file

@ -1,6 +1,7 @@
import json
import re
import nh3
from bleach_allowlist import bleach_allowlist
import frappe
@ -16,15 +17,16 @@ EMOJI_PATTERN = re.compile(
flags=re.UNICODE,
)
# tags for which content needs to be removed from output
REMOVE_CONTENT_TAGS = {"script", "style"}
def clean_html(html):
import bleach
if not isinstance(html, str):
return html
return bleach.clean(
clean_script_and_style(html),
return nh3.clean(
html,
tags={
"div",
"p",
@ -43,57 +45,51 @@ def clean_html(html):
"td",
"tr",
},
attributes=[],
strip=True,
clean_content_tags=REMOVE_CONTENT_TAGS,
strip_comments=True,
)
def clean_email_html(html):
import bleach
from bleach.css_sanitizer import CSSSanitizer
if not isinstance(html, str):
return html
css_sanitizer = CSSSanitizer(
allowed_css_properties=[
"color",
"border-color",
"width",
"height",
"max-width",
"background-color",
"border-collapse",
"border-radius",
"border",
"border-top",
"border-bottom",
"border-left",
"border-right",
"margin",
"margin-top",
"margin-bottom",
"margin-left",
"margin-right",
"padding",
"padding-top",
"padding-bottom",
"padding-left",
"padding-right",
"font-size",
"font-weight",
"font-family",
"text-decoration",
"line-height",
"text-align",
"vertical-align",
"display",
]
)
allowed_css_properties = {
"color",
"border-color",
"width",
"height",
"max-width",
"background-color",
"border-collapse",
"border-radius",
"border",
"border-top",
"border-bottom",
"border-left",
"border-right",
"margin",
"margin-top",
"margin-bottom",
"margin-left",
"margin-right",
"padding",
"padding-top",
"padding-bottom",
"padding-left",
"padding-right",
"font-size",
"font-weight",
"font-family",
"text-decoration",
"line-height",
"text-align",
"vertical-align",
"display",
}
return bleach.clean(
clean_script_and_style(html),
return nh3.clean(
html,
tags={
"div",
"p",
@ -124,16 +120,20 @@ def clean_email_html(html):
"button",
"img",
},
attributes=["border", "colspan", "rowspan", "src", "href", "style", "id"],
css_sanitizer=css_sanitizer,
protocols=["cid", "http", "https", "mailto", "data", "tel"],
strip=True,
attributes={"*": {"border", "colspan", "rowspan", "src", "href", "style", "id"}},
clean_content_tags=REMOVE_CONTENT_TAGS,
filter_style_properties=allowed_css_properties,
strip_comments=True,
url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid", "data"}),
)
def clean_script_and_style(html):
# remove script and style
"""
Remove script and style tags.
DEPRECATED: prefer nh3.clean's clean_content_tags parameter.
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "html5lib")
@ -145,12 +145,10 @@ def clean_script_and_style(html):
def sanitize_html(html, linkify=False, always_sanitize=False):
"""
Sanitize HTML tags, attributes and style to prevent XSS attacks
Based on bleach clean, bleach whitelist and html5lib's Sanitizer defaults
Based on nh3 clean, bleach whitelist and html5lib's Sanitizer defaults
Does not sanitize JSON unless explicitly specified, as it could lead to future problems
"""
import bleach
from bleach.css_sanitizer import CSSSanitizer
from bs4 import BeautifulSoup
if not isinstance(html, str):
@ -164,28 +162,22 @@ def sanitize_html(html, linkify=False, always_sanitize=False):
return html
tags = (
acceptable_elements
+ svg_elements
+ mathml_elements
+ ["html", "head", "meta", "link", "body", "style", "o:p"]
acceptable_elements.union(svg_elements)
.union(mathml_elements)
.union(["html", "head", "meta", "link", "body", "o:p"])
)
def attributes_filter(tag, name, value):
if name.startswith("data-"):
return True
return name in acceptable_attributes
attributes = {"*": attributes_filter, "svg": svg_attributes}
css_sanitizer = CSSSanitizer(allowed_css_properties=bleach_allowlist.all_styles)
attributes = {"*": acceptable_attributes, "svg": svg_attributes}
# returns html with escaped tags, escaped orphan >, <, etc.
escaped_html = bleach.clean(
escaped_html = nh3.clean(
html,
tags=tags,
attributes=attributes,
css_sanitizer=css_sanitizer,
generic_attribute_prefixes={"data-"},
strip_comments=False,
protocols={"cid", "http", "https", "mailto", "tel"},
filter_style_properties=set(bleach_allowlist.all_styles),
url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid"}),
)
return escaped_html
@ -225,7 +217,7 @@ def unescape_html(value):
# adapted from https://raw.githubusercontent.com/html5lib/html5lib-python/4aa79f113e7486c7ec5d15a6e1777bfe546d3259/html5lib/sanitizer.py
acceptable_elements = [
acceptable_elements = {
"a",
"abbr",
"acronym",
@ -327,9 +319,9 @@ acceptable_elements = [
"ul",
"var",
"video",
]
}
mathml_elements = [
mathml_elements = {
"maction",
"math",
"merror",
@ -357,9 +349,9 @@ mathml_elements = [
"munder",
"munderover",
"none",
]
}
svg_elements = [
svg_elements = {
"a",
"animate",
"animateColor",
@ -395,9 +387,9 @@ svg_elements = [
"title",
"tspan",
"use",
]
}
acceptable_attributes = [
acceptable_attributes = {
"abbr",
"accept",
"accept-charset",
@ -501,7 +493,6 @@ acceptable_attributes = [
"prompt",
"radiogroup",
"readonly",
"rel",
"repeat-max",
"repeat-min",
"replace",
@ -559,16 +550,13 @@ acceptable_attributes = [
"itemtype",
"itemid",
"itemref",
"datetime",
"data-is-group",
]
}
mathml_attributes = [
mathml_attributes = {
"actiontype",
"align",
"columnalign",
"columnalign",
"columnalign",
"columnlines",
"columnspacing",
"columnspan",
@ -587,13 +575,10 @@ mathml_attributes = [
"mathbackground",
"mathcolor",
"mathvariant",
"mathvariant",
"maxsize",
"minsize",
"other",
"rowalign",
"rowalign",
"rowalign",
"rowlines",
"rowspacing",
"rowspan",
@ -603,15 +588,14 @@ mathml_attributes = [
"separator",
"stretchy",
"width",
"width",
"xlink:href",
"xlink:show",
"xlink:type",
"xmlns",
"xmlns:xlink",
]
}
svg_attributes = [
svg_attributes = {
"accent-height",
"accumulate",
"additive",
@ -754,4 +738,4 @@ svg_attributes = [
"y1",
"y2",
"zoomAndPan",
]
}

View file

@ -27,14 +27,11 @@ dependencies = [
"PyYAML~=6.0.3",
"RestrictedPython~=8.1",
"WeasyPrint==68.0",
# we don't use tinycss2 directly, but pinned to ensure compatibility with WeasyPrint and bleach
"tinycss2~=1.5.1,<1.6",
"pydyf==0.12.1",
"Werkzeug==3.1.5",
"Whoosh~=2.7.4",
"beautifulsoup4~=4.13.5",
"bleach-allowlist~=1.0.3",
"bleach~=6.3.0",
"chardet~=5.2.0",
"croniter~=6.0.0",
"cryptography~=46.0.3",
@ -46,6 +43,7 @@ dependencies = [
"ldap3~=2.9.1",
"markdown2~=2.5.4",
"MarkupSafe~=3.0.3",
"nh3~=0.3.2",
"num2words~=0.5.14",
"oauthlib~=3.2.2",
"openpyxl~=3.1.5",
@ -108,7 +106,6 @@ dev = [
"types-PyYAML",
"types-Pygments",
"types-beautifulsoup4",
"types-bleach",
"types-cffi",
"types-colorama",
"types-croniter",