From 7f78140015d6e2c9bac86ba9931911009c407dba Mon Sep 17 00:00:00 2001
From: Mouse Reeve <mousereeve@riseup.net>
Date: Mon, 4 Jul 2022 13:14:22 -0700
Subject: [PATCH 1/5] Uses library for html cleanup

---
 bookwyrm/models/fields.py            |  6 +--
 bookwyrm/sanitize_html.py            | 71 ----------------------------
 bookwyrm/tests/test_sanitize_html.py | 36 ++++++--------
 bookwyrm/utils/sanitizer.py          | 25 ++++++++++
 bookwyrm/views/status.py             |  7 +--
 requirements.txt                     |  1 +
 6 files changed, 44 insertions(+), 102 deletions(-)
 delete mode 100644 bookwyrm/sanitize_html.py
 create mode 100644 bookwyrm/utils/sanitizer.py

diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py
index 62c61cc405..785f3397c9 100644
--- a/bookwyrm/models/fields.py
+++ b/bookwyrm/models/fields.py
@@ -16,7 +16,7 @@
 
 from bookwyrm import activitypub
 from bookwyrm.connectors import get_image
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean
 from bookwyrm.settings import MEDIA_FULL_URL
 
 
@@ -497,9 +497,7 @@ class HtmlField(ActivitypubFieldMixin, models.TextField):
     def field_from_activity(self, value):
         if not value or value == MISSING:
             return None
-        sanitizer = InputHtmlParser()
-        sanitizer.feed(value)
-        return sanitizer.get_output()
+        return clean(value)
 
 
 class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py
deleted file mode 100644
index 4edd2818eb..0000000000
--- a/bookwyrm/sanitize_html.py
+++ /dev/null
@@ -1,71 +0,0 @@
-""" html parser to clean up incoming text from unknown sources """
-from html.parser import HTMLParser
-
-
-class InputHtmlParser(HTMLParser):  # pylint: disable=abstract-method
-    """Removes any html that isn't allowed_tagsed from a block"""
-
-    def __init__(self):
-        HTMLParser.__init__(self)
-        self.allowed_tags = [
-            "p",
-            "blockquote",
-            "br",
-            "b",
-            "i",
-            "strong",
-            "em",
-            "pre",
-            "a",
-            "span",
-            "ul",
-            "ol",
-            "li",
-        ]
-        self.allowed_attrs = ["href", "rel", "src", "alt"]
-        self.tag_stack = []
-        self.output = []
-        # if the html appears invalid, we just won't allow any at all
-        self.allow_html = True
-
-    def handle_starttag(self, tag, attrs):
-        """check if the tag is valid"""
-        if self.allow_html and tag in self.allowed_tags:
-            allowed_attrs = " ".join(
-                f'{a}="{v}"' for a, v in attrs if a in self.allowed_attrs
-            )
-            reconstructed = f"<{tag}"
-            if allowed_attrs:
-                reconstructed += " " + allowed_attrs
-            reconstructed += ">"
-            self.output.append(("tag", reconstructed))
-            self.tag_stack.append(tag)
-        else:
-            self.output.append(("data", ""))
-
-    def handle_endtag(self, tag):
-        """keep the close tag"""
-        if not self.allow_html or tag not in self.allowed_tags:
-            self.output.append(("data", ""))
-            return
-
-        if not self.tag_stack or self.tag_stack[-1] != tag:
-            # the end tag doesn't match the most recent start tag
-            self.allow_html = False
-            self.output.append(("data", ""))
-            return
-
-        self.tag_stack = self.tag_stack[:-1]
-        self.output.append(("tag", f"</{tag}>"))
-
-    def handle_data(self, data):
-        """extract the answer, if we're in an answer tag"""
-        self.output.append(("data", data))
-
-    def get_output(self):
-        """convert the output from a list of tuples to a string"""
-        if self.tag_stack:
-            self.allow_html = False
-        if not self.allow_html:
-            return "".join(v for (k, v) in self.output if k == "data")
-        return "".join(v for (k, v) in self.output)
diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py
index 5814f22071..ecdd697937 100644
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@@ -1,7 +1,7 @@
 """ make sure only valid html gets to the app """
 from django.test import TestCase
 
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils.sanitizer import clean
 
 
 class Sanitizer(TestCase):
@@ -10,53 +10,45 @@ class Sanitizer(TestCase):
     def test_no_html(self):
         """just text"""
         input_text = "no      html  "
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(input_text, output)
 
     def test_valid_html(self):
         """leave the html untouched"""
         input_text = "<b>yes    </b> <i>html</i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(input_text, output)
 
     def test_valid_html_attrs(self):
         """and don't remove useful attributes"""
         input_text = '<a href="fish.com">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(input_text, output)
 
     def test_valid_html_invalid_attrs(self):
         """do remove un-approved attributes"""
         input_text = '<a href="fish.com" fish="hello">yes    </a> <i>html</i>'
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual(output, '<a href="fish.com">yes    </a> <i>html</i>')
 
     def test_invalid_html(self):
         """remove all html when the html is malformed"""
         input_text = "<b>yes  <i>html</i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual("yes  html", output)
 
         input_text = "yes <i></b>html   </i>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual("yes html   ", output)
 
     def test_disallowed_html(self):
         """remove disallowed html but keep allowed html"""
         input_text = "<div>  yes <i>html</i></div>"
-        parser = InputHtmlParser()
-        parser.feed(input_text)
-        output = parser.get_output()
+        output = clean(input_text)
         self.assertEqual("  yes <i>html</i>", output)
+
+    def test_escaped_bracket(self):
+        """remove &gt; and &lt;"""
+        input_text = "&lt;dev&gt;hi&lt;/div&gt;"
+        output = clean(input_text)
+        self.assertEqual("hi", output)
diff --git a/bookwyrm/utils/sanitizer.py b/bookwyrm/utils/sanitizer.py
new file mode 100644
index 0000000000..676921949f
--- /dev/null
+++ b/bookwyrm/utils/sanitizer.py
@@ -0,0 +1,25 @@
+"""Clean user-provided text"""
+import bleach
+
+
+def clean(input_text):
+    """Run through "bleach" """
+    return bleach.clean(
+        input_text,
+        tags=[
+            "p",
+            "blockquote",
+            "br",
+            "b",
+            "i",
+            "strong",
+            "em",
+            "pre",
+            "a",
+            "span",
+            "ul",
+            "ol",
+            "li",
+        ],
+        attributes=["href", "rel", "src", "alt"],
+    )
diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 670ea57173..0dd9e0f800 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -16,9 +16,8 @@
 
 from markdown import markdown
 from bookwyrm import forms, models
-from bookwyrm.sanitize_html import InputHtmlParser
 from bookwyrm.settings import DOMAIN
-from bookwyrm.utils import regex
+from bookwyrm.utils import regex, sanitizer
 from .helpers import handle_remote_webfinger, is_api_request
 from .helpers import load_date_in_user_tz_as_utc
 
@@ -268,6 +267,4 @@ def to_markdown(content):
     content = format_links(content)
     content = markdown(content)
     # sanitize resulting html
-    sanitizer = InputHtmlParser()
-    sanitizer.feed(content)
-    return sanitizer.get_output()
+    return sanitizer.clean(content)
diff --git a/requirements.txt b/requirements.txt
index 0155782cc8..3d9f686aeb 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 aiohttp==3.8.1
+bleach==5.0.1
 celery==5.2.2
 colorthief==0.2.1
 Django==3.2.13

From 62aa4bf869625186e0c7f24d259cfb2caac6f4d3 Mon Sep 17 00:00:00 2001
From: Mouse Reeve <mousereeve@riseup.net>
Date: Mon, 4 Jul 2022 13:21:18 -0700
Subject: [PATCH 2/5] Tick version number

---
 bookwyrm/settings.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bookwyrm/settings.py b/bookwyrm/settings.py
index dc0d71f30a..e67fb5e1e0 100644
--- a/bookwyrm/settings.py
+++ b/bookwyrm/settings.py
@@ -11,7 +11,7 @@
 env = Env()
 env.read_env()
 DOMAIN = env("DOMAIN")
-VERSION = "0.4.0"
+VERSION = "0.4.1"
 
 RELEASE_API = env(
     "RELEASE_API",

From 13376f89708fea095d97fdeed6950da701aa52d9 Mon Sep 17 00:00:00 2001
From: Mouse Reeve <mousereeve@riseup.net>
Date: Mon, 4 Jul 2022 13:24:13 -0700
Subject: [PATCH 3/5] Catches missing reference to previous sanitizer

---
 bookwyrm/status.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/bookwyrm/status.py b/bookwyrm/status.py
index 09fbdc06e6..de7682ee72 100644
--- a/bookwyrm/status.py
+++ b/bookwyrm/status.py
@@ -2,15 +2,13 @@
 from django.db import transaction
 
 from bookwyrm import models
-from bookwyrm.sanitize_html import InputHtmlParser
+from bookwyrm.utils import sanitizer
 
 
 def create_generated_note(user, content, mention_books=None, privacy="public"):
     """a note created by the app about user activity"""
     # sanitize input html
-    parser = InputHtmlParser()
-    parser.feed(content)
-    content = parser.get_output()
+    content = sanitizer.clean(content)
 
     with transaction.atomic():
         # create but don't save

From 70beb24d95657d5186d73d71599088046dd5e891 Mon Sep 17 00:00:00 2001
From: Mouse Reeve <mousereeve@riseup.net>
Date: Mon, 4 Jul 2022 13:34:09 -0700
Subject: [PATCH 4/5] Removed misleading test

This wasn't really testing what I wanted it to.
---
 bookwyrm/tests/test_sanitize_html.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py
index ecdd697937..ca1643e8f5 100644
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@@ -46,9 +46,3 @@ def test_disallowed_html(self):
         input_text = "<div>  yes <i>html</i></div>"
         output = clean(input_text)
         self.assertEqual("  yes <i>html</i>", output)
-
-    def test_escaped_bracket(self):
-        """remove &gt; and &lt;"""
-        input_text = "&lt;dev&gt;hi&lt;/div&gt;"
-        output = clean(input_text)
-        self.assertEqual("hi", output)

From 9d9b7f366a2b6b711c5b6f3609286a26f98486de Mon Sep 17 00:00:00 2001
From: Mouse Reeve <mousereeve@riseup.net>
Date: Mon, 4 Jul 2022 13:45:28 -0700
Subject: [PATCH 5/5] Use "strip" in bleach

This removes forbidden html, rather than leaving them in place but
unrendered.
---
 bookwyrm/tests/test_sanitize_html.py | 6 +++---
 bookwyrm/utils/sanitizer.py          | 1 +
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py
index ca1643e8f5..449acdafbe 100644
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@@ -32,14 +32,14 @@ def test_valid_html_invalid_attrs(self):
         self.assertEqual(output, '<a href="fish.com">yes    </a> <i>html</i>')
 
     def test_invalid_html(self):
-        """remove all html when the html is malformed"""
+        """don't allow malformed html"""
         input_text = "<b>yes  <i>html</i>"
         output = clean(input_text)
-        self.assertEqual("yes  html", output)
+        self.assertEqual("<b>yes  <i>html</i></b>", output)
 
         input_text = "yes <i></b>html   </i>"
         output = clean(input_text)
-        self.assertEqual("yes html   ", output)
+        self.assertEqual("yes <i>html   </i>", output)
 
     def test_disallowed_html(self):
         """remove disallowed html but keep allowed html"""
diff --git a/bookwyrm/utils/sanitizer.py b/bookwyrm/utils/sanitizer.py
index 676921949f..f6c87358cb 100644
--- a/bookwyrm/utils/sanitizer.py
+++ b/bookwyrm/utils/sanitizer.py
@@ -22,4 +22,5 @@ def clean(input_text):
             "li",
         ],
         attributes=["href", "rel", "src", "alt"],
+        strip=True,
     )