From 7f78140015d6e2c9bac86ba9931911009c407dba Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 4 Jul 2022 13:14:22 -0700 Subject: [PATCH 1/5] Uses library for html cleanup --- bookwyrm/models/fields.py | 6 +-- bookwyrm/sanitize_html.py | 71 ---------------------------- bookwyrm/tests/test_sanitize_html.py | 36 ++++++-------- bookwyrm/utils/sanitizer.py | 25 ++++++++++ bookwyrm/views/status.py | 7 +-- requirements.txt | 1 + 6 files changed, 44 insertions(+), 102 deletions(-) delete mode 100644 bookwyrm/sanitize_html.py create mode 100644 bookwyrm/utils/sanitizer.py diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py index 62c61cc405..785f3397c9 100644 --- a/bookwyrm/models/fields.py +++ b/bookwyrm/models/fields.py @@ -16,7 +16,7 @@ from bookwyrm import activitypub from bookwyrm.connectors import get_image -from bookwyrm.sanitize_html import InputHtmlParser +from bookwyrm.utils.sanitizer import clean from bookwyrm.settings import MEDIA_FULL_URL @@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField): def field_from_activity(self, value): if not value or value == MISSING: return None - sanitizer = InputHtmlParser() - sanitizer.feed(value) - return sanitizer.get_output() + return clean(value) class ArrayField(ActivitypubFieldMixin, DjangoArrayField): diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py deleted file mode 100644 index 4edd2818eb..0000000000 --- a/bookwyrm/sanitize_html.py +++ /dev/null @@ -1,71 +0,0 @@ -""" html parser to clean up incoming text from unknown sources """ -from html.parser import HTMLParser - - -class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method - """Removes any html that isn't allowed_tagsed from a block""" - - def __init__(self): - HTMLParser.__init__(self) - self.allowed_tags = [ - "p", - "blockquote", - "br", - "b", - "i", - "strong", - "em", - "pre", - "a", - "span", - "ul", - "ol", - "li", - ] - self.allowed_attrs = ["href", "rel", "src", "alt"] - self.tag_stack = [] - self.output = [] - # if the html appears invalid, we just won't allow any at all - self.allow_html = True - - def handle_starttag(self, tag, attrs): - """check if the tag is valid""" - if self.allow_html and tag in self.allowed_tags: - allowed_attrs = " ".join( - f'{a}="{v}"' for a, v in attrs if a in self.allowed_attrs - ) - reconstructed = f"<{tag}" - if allowed_attrs: - reconstructed += " " + allowed_attrs - reconstructed += ">" - self.output.append(("tag", reconstructed)) - self.tag_stack.append(tag) - else: - self.output.append(("data", "")) - - def handle_endtag(self, tag): - """keep the close tag""" - if not self.allow_html or tag not in self.allowed_tags: - self.output.append(("data", "")) - return - - if not self.tag_stack or self.tag_stack[-1] != tag: - # the end tag doesn't match the most recent start tag - self.allow_html = False - self.output.append(("data", "")) - return - - self.tag_stack = self.tag_stack[:-1] - self.output.append(("tag", f"")) - - def handle_data(self, data): - """extract the answer, if we're in an answer tag""" - self.output.append(("data", data)) - - def get_output(self): - """convert the output from a list of tuples to a string""" - if self.tag_stack: - self.allow_html = False - if not self.allow_html: - return "".join(v for (k, v) in self.output if k == "data") - return "".join(v for (k, v) in self.output) diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py index 5814f22071..ecdd697937 100644 --- a/bookwyrm/tests/test_sanitize_html.py +++ b/bookwyrm/tests/test_sanitize_html.py @@ -1,7 +1,7 @@ """ make sure only valid html gets to the app """ from django.test import TestCase -from bookwyrm.sanitize_html import InputHtmlParser +from bookwyrm.utils.sanitizer import clean class Sanitizer(TestCase): @@ -10,53 +10,45 @@ class Sanitizer(TestCase): def test_no_html(self): """just text""" input_text = "no html " - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(input_text, output) def test_valid_html(self): """leave the html untouched""" input_text = "yes html" - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(input_text, output) def test_valid_html_attrs(self): """and don't remove useful attributes""" input_text = 'yes html' - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(input_text, output) def test_valid_html_invalid_attrs(self): """do remove un-approved attributes""" input_text = 'yes html' - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(output, 'yes html') def test_invalid_html(self): """remove all html when the html is malformed""" input_text = "yes html" - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual("yes html", output) input_text = "yes html " - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual("yes html ", output) def test_disallowed_html(self): """remove disallowed html but keep allowed html""" input_text = "
yes html
" - parser = InputHtmlParser() - parser.feed(input_text) - output = parser.get_output() + output = clean(input_text) self.assertEqual(" yes html", output) + + def test_escaped_bracket(self): + """remove > and <""" + input_text = "<dev>hi</div>" + output = clean(input_text) + self.assertEqual("hi", output) diff --git a/bookwyrm/utils/sanitizer.py b/bookwyrm/utils/sanitizer.py new file mode 100644 index 0000000000..676921949f --- /dev/null +++ b/bookwyrm/utils/sanitizer.py @@ -0,0 +1,25 @@ +"""Clean user-provided text""" +import bleach + + +def clean(input_text): + """Run through "bleach" """ + return bleach.clean( + input_text, + tags=[ + "p", + "blockquote", + "br", + "b", + "i", + "strong", + "em", + "pre", + "a", + "span", + "ul", + "ol", + "li", + ], + attributes=["href", "rel", "src", "alt"], + ) diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py index 670ea57173..0dd9e0f800 100644 --- a/bookwyrm/views/status.py +++ b/bookwyrm/views/status.py @@ -16,9 +16,8 @@ from markdown import markdown from bookwyrm import forms, models -from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.settings import DOMAIN -from bookwyrm.utils import regex +from bookwyrm.utils import regex, sanitizer from .helpers import handle_remote_webfinger, is_api_request from .helpers import load_date_in_user_tz_as_utc @@ -268,6 +267,4 @@ def to_markdown(content): content = format_links(content) content = markdown(content) # sanitize resulting html - sanitizer = InputHtmlParser() - sanitizer.feed(content) - return sanitizer.get_output() + return sanitizer.clean(content) diff --git a/requirements.txt b/requirements.txt index 0155782cc8..3d9f686aeb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ aiohttp==3.8.1 +bleach==5.0.1 celery==5.2.2 colorthief==0.2.1 Django==3.2.13 From 62aa4bf869625186e0c7f24d259cfb2caac6f4d3 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 4 Jul 2022 13:21:18 -0700 Subject: [PATCH 2/5] Tick version number --- bookwyrm/settings.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bookwyrm/settings.py b/bookwyrm/settings.py index dc0d71f30a..e67fb5e1e0 100644 --- a/bookwyrm/settings.py +++ b/bookwyrm/settings.py @@ -11,7 +11,7 @@ env = Env() env.read_env() DOMAIN = env("DOMAIN") -VERSION = "0.4.0" +VERSION = "0.4.1" RELEASE_API = env( "RELEASE_API", From 13376f89708fea095d97fdeed6950da701aa52d9 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 4 Jul 2022 13:24:13 -0700 Subject: [PATCH 3/5] Catches missing reference to previous sanitizer --- bookwyrm/status.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bookwyrm/status.py b/bookwyrm/status.py index 09fbdc06e6..de7682ee72 100644 --- a/bookwyrm/status.py +++ b/bookwyrm/status.py @@ -2,15 +2,13 @@ from django.db import transaction from bookwyrm import models -from bookwyrm.sanitize_html import InputHtmlParser +from bookwyrm.utils import sanitizer def create_generated_note(user, content, mention_books=None, privacy="public"): """a note created by the app about user activity""" # sanitize input html - parser = InputHtmlParser() - parser.feed(content) - content = parser.get_output() + content = sanitizer.clean(content) with transaction.atomic(): # create but don't save From 70beb24d95657d5186d73d71599088046dd5e891 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 4 Jul 2022 13:34:09 -0700 Subject: [PATCH 4/5] Removed misleading test This wasn't really testing what I wanted it to. --- bookwyrm/tests/test_sanitize_html.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py index ecdd697937..ca1643e8f5 100644 --- a/bookwyrm/tests/test_sanitize_html.py +++ b/bookwyrm/tests/test_sanitize_html.py @@ -46,9 +46,3 @@ def test_disallowed_html(self): input_text = "
yes html
" output = clean(input_text) self.assertEqual(" yes html", output) - - def test_escaped_bracket(self): - """remove > and <""" - input_text = "<dev>hi</div>" - output = clean(input_text) - self.assertEqual("hi", output) From 9d9b7f366a2b6b711c5b6f3609286a26f98486de Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 4 Jul 2022 13:45:28 -0700 Subject: [PATCH 5/5] Use "strip" in bleach This removes forbidden html, rather than leaving them in place but unrendered. --- bookwyrm/tests/test_sanitize_html.py | 6 +++--- bookwyrm/utils/sanitizer.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py index ca1643e8f5..449acdafbe 100644 --- a/bookwyrm/tests/test_sanitize_html.py +++ b/bookwyrm/tests/test_sanitize_html.py @@ -32,14 +32,14 @@ def test_valid_html_invalid_attrs(self): self.assertEqual(output, 'yes html') def test_invalid_html(self): - """remove all html when the html is malformed""" + """don't allow malformed html""" input_text = "yes html" output = clean(input_text) - self.assertEqual("yes html", output) + self.assertEqual("yes html", output) input_text = "yes html " output = clean(input_text) - self.assertEqual("yes html ", output) + self.assertEqual("yes html ", output) def test_disallowed_html(self): """remove disallowed html but keep allowed html""" diff --git a/bookwyrm/utils/sanitizer.py b/bookwyrm/utils/sanitizer.py index 676921949f..f6c87358cb 100644 --- a/bookwyrm/utils/sanitizer.py +++ b/bookwyrm/utils/sanitizer.py @@ -22,4 +22,5 @@ def clean(input_text): "li", ], attributes=["href", "rel", "src", "alt"], + strip=True, )