Merge pull request #34740 from alexleach/bleach-to-nh3
refactor!: Replace bleach HTML sanitiser for nh3
This commit is contained in:
commit
fb56fbcab8
6 changed files with 79 additions and 98 deletions
|
|
@ -40,7 +40,7 @@ import gettext
|
|||
|
||||
import babel
|
||||
import babel.messages
|
||||
import bleach
|
||||
import nh3
|
||||
import num2words
|
||||
import pydantic
|
||||
|
||||
|
|
|
|||
|
|
@ -132,7 +132,7 @@ class TestEmailAccount(IntegrationTestCase):
|
|||
TestEmailAccount.mocked_email_receive(email_account, messages)
|
||||
|
||||
comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"})
|
||||
self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content)
|
||||
self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content)
|
||||
self.assertTrue(
|
||||
"This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content
|
||||
)
|
||||
|
|
@ -153,7 +153,7 @@ class TestEmailAccount(IntegrationTestCase):
|
|||
TestEmailAccount.mocked_email_receive(email_account, messages)
|
||||
|
||||
comm = frappe.get_doc("Communication", {"sender": "test_sender@example.com"})
|
||||
self.assertTrue("From: "Microsoft Outlook" <test_sender@example.com>" in comm.content)
|
||||
self.assertTrue('From: "Microsoft Outlook" <test_sender@example.com>' in comm.content)
|
||||
self.assertTrue(
|
||||
"This is an e-mail message sent automatically by Microsoft Outlook while" in comm.content
|
||||
)
|
||||
|
|
|
|||
|
|
@ -262,16 +262,16 @@ class TestDocument(IntegrationTestCase):
|
|||
|
||||
def test_xss_filter(self):
|
||||
d = self.test_insert()
|
||||
subject = d.subject
|
||||
|
||||
# script
|
||||
xss = '<script>alert("XSS")</script>'
|
||||
escaped_xss = xss.replace("<", "<").replace(">", ">")
|
||||
d.subject += xss
|
||||
d.save()
|
||||
d.reload()
|
||||
|
||||
self.assertTrue(xss not in d.subject)
|
||||
self.assertTrue(escaped_xss in d.subject)
|
||||
self.assertEqual(subject, d.subject)
|
||||
|
||||
# onload
|
||||
xss = '<div onload="alert("XSS")">Test</div>'
|
||||
|
|
|
|||
|
|
@ -508,7 +508,7 @@ class TestHTMLUtils(IntegrationTestCase):
|
|||
sample = """<h1>Hello</h1><p>Para</p><a href="http://test.com">text</a>"""
|
||||
clean = clean_email_html(sample)
|
||||
self.assertTrue("<h1>Hello</h1>" in clean)
|
||||
self.assertTrue('<a href="http://test.com">text</a>' in clean)
|
||||
self.assertTrue('<a href="http://test.com" rel="noopener noreferrer">text</a>' in clean)
|
||||
|
||||
def test_sanitize_html(self):
|
||||
from frappe.utils.html_utils import sanitize_html
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
import json
|
||||
import re
|
||||
|
||||
import nh3
|
||||
from bleach_allowlist import bleach_allowlist
|
||||
|
||||
import frappe
|
||||
|
|
@ -16,15 +17,16 @@ EMOJI_PATTERN = re.compile(
|
|||
flags=re.UNICODE,
|
||||
)
|
||||
|
||||
# tags for which content needs to be removed from output
|
||||
REMOVE_CONTENT_TAGS = {"script", "style"}
|
||||
|
||||
|
||||
def clean_html(html):
|
||||
import bleach
|
||||
|
||||
if not isinstance(html, str):
|
||||
return html
|
||||
|
||||
return bleach.clean(
|
||||
clean_script_and_style(html),
|
||||
return nh3.clean(
|
||||
html,
|
||||
tags={
|
||||
"div",
|
||||
"p",
|
||||
|
|
@ -43,57 +45,51 @@ def clean_html(html):
|
|||
"td",
|
||||
"tr",
|
||||
},
|
||||
attributes=[],
|
||||
strip=True,
|
||||
clean_content_tags=REMOVE_CONTENT_TAGS,
|
||||
strip_comments=True,
|
||||
)
|
||||
|
||||
|
||||
def clean_email_html(html):
|
||||
import bleach
|
||||
from bleach.css_sanitizer import CSSSanitizer
|
||||
|
||||
if not isinstance(html, str):
|
||||
return html
|
||||
|
||||
css_sanitizer = CSSSanitizer(
|
||||
allowed_css_properties=[
|
||||
"color",
|
||||
"border-color",
|
||||
"width",
|
||||
"height",
|
||||
"max-width",
|
||||
"background-color",
|
||||
"border-collapse",
|
||||
"border-radius",
|
||||
"border",
|
||||
"border-top",
|
||||
"border-bottom",
|
||||
"border-left",
|
||||
"border-right",
|
||||
"margin",
|
||||
"margin-top",
|
||||
"margin-bottom",
|
||||
"margin-left",
|
||||
"margin-right",
|
||||
"padding",
|
||||
"padding-top",
|
||||
"padding-bottom",
|
||||
"padding-left",
|
||||
"padding-right",
|
||||
"font-size",
|
||||
"font-weight",
|
||||
"font-family",
|
||||
"text-decoration",
|
||||
"line-height",
|
||||
"text-align",
|
||||
"vertical-align",
|
||||
"display",
|
||||
]
|
||||
)
|
||||
allowed_css_properties = {
|
||||
"color",
|
||||
"border-color",
|
||||
"width",
|
||||
"height",
|
||||
"max-width",
|
||||
"background-color",
|
||||
"border-collapse",
|
||||
"border-radius",
|
||||
"border",
|
||||
"border-top",
|
||||
"border-bottom",
|
||||
"border-left",
|
||||
"border-right",
|
||||
"margin",
|
||||
"margin-top",
|
||||
"margin-bottom",
|
||||
"margin-left",
|
||||
"margin-right",
|
||||
"padding",
|
||||
"padding-top",
|
||||
"padding-bottom",
|
||||
"padding-left",
|
||||
"padding-right",
|
||||
"font-size",
|
||||
"font-weight",
|
||||
"font-family",
|
||||
"text-decoration",
|
||||
"line-height",
|
||||
"text-align",
|
||||
"vertical-align",
|
||||
"display",
|
||||
}
|
||||
|
||||
return bleach.clean(
|
||||
clean_script_and_style(html),
|
||||
return nh3.clean(
|
||||
html,
|
||||
tags={
|
||||
"div",
|
||||
"p",
|
||||
|
|
@ -124,16 +120,20 @@ def clean_email_html(html):
|
|||
"button",
|
||||
"img",
|
||||
},
|
||||
attributes=["border", "colspan", "rowspan", "src", "href", "style", "id"],
|
||||
css_sanitizer=css_sanitizer,
|
||||
protocols=["cid", "http", "https", "mailto", "data", "tel"],
|
||||
strip=True,
|
||||
attributes={"*": {"border", "colspan", "rowspan", "src", "href", "style", "id"}},
|
||||
clean_content_tags=REMOVE_CONTENT_TAGS,
|
||||
filter_style_properties=allowed_css_properties,
|
||||
strip_comments=True,
|
||||
url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid", "data"}),
|
||||
)
|
||||
|
||||
|
||||
def clean_script_and_style(html):
|
||||
# remove script and style
|
||||
"""
|
||||
Remove script and style tags.
|
||||
DEPRECATED: prefer nh3.clean's clean_content_tags parameter.
|
||||
"""
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(html, "html5lib")
|
||||
|
|
@ -145,12 +145,10 @@ def clean_script_and_style(html):
|
|||
def sanitize_html(html, linkify=False, always_sanitize=False):
|
||||
"""
|
||||
Sanitize HTML tags, attributes and style to prevent XSS attacks
|
||||
Based on bleach clean, bleach whitelist and html5lib's Sanitizer defaults
|
||||
Based on nh3 clean, bleach whitelist and html5lib's Sanitizer defaults
|
||||
|
||||
Does not sanitize JSON unless explicitly specified, as it could lead to future problems
|
||||
"""
|
||||
import bleach
|
||||
from bleach.css_sanitizer import CSSSanitizer
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
if not isinstance(html, str):
|
||||
|
|
@ -164,28 +162,22 @@ def sanitize_html(html, linkify=False, always_sanitize=False):
|
|||
return html
|
||||
|
||||
tags = (
|
||||
acceptable_elements
|
||||
+ svg_elements
|
||||
+ mathml_elements
|
||||
+ ["html", "head", "meta", "link", "body", "style", "o:p"]
|
||||
acceptable_elements.union(svg_elements)
|
||||
.union(mathml_elements)
|
||||
.union(["html", "head", "meta", "link", "body", "o:p"])
|
||||
)
|
||||
|
||||
def attributes_filter(tag, name, value):
|
||||
if name.startswith("data-"):
|
||||
return True
|
||||
return name in acceptable_attributes
|
||||
|
||||
attributes = {"*": attributes_filter, "svg": svg_attributes}
|
||||
css_sanitizer = CSSSanitizer(allowed_css_properties=bleach_allowlist.all_styles)
|
||||
attributes = {"*": acceptable_attributes, "svg": svg_attributes}
|
||||
|
||||
# returns html with escaped tags, escaped orphan >, <, etc.
|
||||
escaped_html = bleach.clean(
|
||||
escaped_html = nh3.clean(
|
||||
html,
|
||||
tags=tags,
|
||||
attributes=attributes,
|
||||
css_sanitizer=css_sanitizer,
|
||||
generic_attribute_prefixes={"data-"},
|
||||
strip_comments=False,
|
||||
protocols={"cid", "http", "https", "mailto", "tel"},
|
||||
filter_style_properties=set(bleach_allowlist.all_styles),
|
||||
url_schemes=nh3.ALLOWED_URL_SCHEMES.union({"cid"}),
|
||||
)
|
||||
|
||||
return escaped_html
|
||||
|
|
@ -225,7 +217,7 @@ def unescape_html(value):
|
|||
|
||||
|
||||
# adapted from https://raw.githubusercontent.com/html5lib/html5lib-python/4aa79f113e7486c7ec5d15a6e1777bfe546d3259/html5lib/sanitizer.py
|
||||
acceptable_elements = [
|
||||
acceptable_elements = {
|
||||
"a",
|
||||
"abbr",
|
||||
"acronym",
|
||||
|
|
@ -327,9 +319,9 @@ acceptable_elements = [
|
|||
"ul",
|
||||
"var",
|
||||
"video",
|
||||
]
|
||||
}
|
||||
|
||||
mathml_elements = [
|
||||
mathml_elements = {
|
||||
"maction",
|
||||
"math",
|
||||
"merror",
|
||||
|
|
@ -357,9 +349,9 @@ mathml_elements = [
|
|||
"munder",
|
||||
"munderover",
|
||||
"none",
|
||||
]
|
||||
}
|
||||
|
||||
svg_elements = [
|
||||
svg_elements = {
|
||||
"a",
|
||||
"animate",
|
||||
"animateColor",
|
||||
|
|
@ -395,9 +387,9 @@ svg_elements = [
|
|||
"title",
|
||||
"tspan",
|
||||
"use",
|
||||
]
|
||||
}
|
||||
|
||||
acceptable_attributes = [
|
||||
acceptable_attributes = {
|
||||
"abbr",
|
||||
"accept",
|
||||
"accept-charset",
|
||||
|
|
@ -501,7 +493,6 @@ acceptable_attributes = [
|
|||
"prompt",
|
||||
"radiogroup",
|
||||
"readonly",
|
||||
"rel",
|
||||
"repeat-max",
|
||||
"repeat-min",
|
||||
"replace",
|
||||
|
|
@ -559,16 +550,13 @@ acceptable_attributes = [
|
|||
"itemtype",
|
||||
"itemid",
|
||||
"itemref",
|
||||
"datetime",
|
||||
"data-is-group",
|
||||
]
|
||||
}
|
||||
|
||||
mathml_attributes = [
|
||||
mathml_attributes = {
|
||||
"actiontype",
|
||||
"align",
|
||||
"columnalign",
|
||||
"columnalign",
|
||||
"columnalign",
|
||||
"columnlines",
|
||||
"columnspacing",
|
||||
"columnspan",
|
||||
|
|
@ -587,13 +575,10 @@ mathml_attributes = [
|
|||
"mathbackground",
|
||||
"mathcolor",
|
||||
"mathvariant",
|
||||
"mathvariant",
|
||||
"maxsize",
|
||||
"minsize",
|
||||
"other",
|
||||
"rowalign",
|
||||
"rowalign",
|
||||
"rowalign",
|
||||
"rowlines",
|
||||
"rowspacing",
|
||||
"rowspan",
|
||||
|
|
@ -603,15 +588,14 @@ mathml_attributes = [
|
|||
"separator",
|
||||
"stretchy",
|
||||
"width",
|
||||
"width",
|
||||
"xlink:href",
|
||||
"xlink:show",
|
||||
"xlink:type",
|
||||
"xmlns",
|
||||
"xmlns:xlink",
|
||||
]
|
||||
}
|
||||
|
||||
svg_attributes = [
|
||||
svg_attributes = {
|
||||
"accent-height",
|
||||
"accumulate",
|
||||
"additive",
|
||||
|
|
@ -754,4 +738,4 @@ svg_attributes = [
|
|||
"y1",
|
||||
"y2",
|
||||
"zoomAndPan",
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -27,14 +27,11 @@ dependencies = [
|
|||
"PyYAML~=6.0.3",
|
||||
"RestrictedPython~=8.1",
|
||||
"WeasyPrint==68.0",
|
||||
# we don't use tinycss2 directly, but pinned to ensure compatibility with WeasyPrint and bleach
|
||||
"tinycss2~=1.5.1,<1.6",
|
||||
"pydyf==0.12.1",
|
||||
"Werkzeug==3.1.5",
|
||||
"Whoosh~=2.7.4",
|
||||
"beautifulsoup4~=4.13.5",
|
||||
"bleach-allowlist~=1.0.3",
|
||||
"bleach~=6.3.0",
|
||||
"chardet~=5.2.0",
|
||||
"croniter~=6.0.0",
|
||||
"cryptography~=46.0.3",
|
||||
|
|
@ -46,6 +43,7 @@ dependencies = [
|
|||
"ldap3~=2.9.1",
|
||||
"markdown2~=2.5.4",
|
||||
"MarkupSafe~=3.0.3",
|
||||
"nh3~=0.3.2",
|
||||
"num2words~=0.5.14",
|
||||
"oauthlib~=3.2.2",
|
||||
"openpyxl~=3.1.5",
|
||||
|
|
@ -108,7 +106,6 @@ dev = [
|
|||
"types-PyYAML",
|
||||
"types-Pygments",
|
||||
"types-beautifulsoup4",
|
||||
"types-bleach",
|
||||
"types-cffi",
|
||||
"types-colorama",
|
||||
"types-croniter",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue