Merge pull request #538 from Crozzers/mixed-header-ids

Include HTML headers in TOC
trentm · Dec 1, 2023 · efd824f · efd824f
2 parents 182b7f5 + b93c41e
commit efd824f
Show file tree

Hide file tree

Showing 6 changed files with 98 additions and 1 deletion.
diff --git a/CHANGES.md b/CHANGES.md
@@ -8,6 +8,7 @@
 - [pull #532] Fix #493 persisting when `code-friendly` extra enabled
 - [pull #535] Update `_slugify` to use utf-8 encoding (issue #534)
 - [pull #536] Maintain order of appearance in footnotes
+- [pull #538] Include HTML headers in TOC
 
 ## python-markdown2 2.4.10
 

diff --git a/lib/markdown2.py b/lib/markdown2.py
@@ -240,6 +240,13 @@ def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
             else:
                 self._toc_depth = self.extras["toc"].get("depth", 6)
 
+        if 'header-ids' in self.extras:
+            if not isinstance(self.extras['header-ids'], dict):
+                self.extras['header-ids'] = {
+                    'mixed': False,
+                    'prefix': self.extras['header-ids']
+                }
+
         if 'break-on-newline' in self.extras:
             self.extras.setdefault('breaks', {})
             self.extras['breaks']['on_newline'] = True
@@ -424,6 +431,17 @@ def convert(self, text):
             text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)
 
         if "toc" in self.extras and self._toc:
+            if self.extras['header-ids'].get('mixed'):
+                # TOC will only be out of order if mixed headers is enabled
+                def toc_sort(entry):
+                    '''Sort the TOC by order of appearance in text'''
+                    return re.search(
+                        # header tag, any attrs, the ID, any attrs, the text, close tag
+                        r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (entry[0], entry[1], re.escape(entry[2])),
+                        text, re.M
+                    ).start()
+
+                self._toc.sort(key=toc_sort)
             self._toc_html = calculate_toc_html(self._toc)
 
             # Prepend toc html to output
@@ -783,6 +801,8 @@ def _hash_html_block_sub(self, match, raw=False):
                 return ''.join(["\n\n", f_key,
                     "\n\n", middle, "\n\n",
                     l_key, "\n\n"])
+        elif self.extras.get('header-ids', {}).get('mixed') and self._h_tag_re.match(html):
+            html = self._h_tag_re.sub(self._h_tag_sub, html)
         key = _hash_text(html)
         self.html_blocks[key] = html
         return "\n\n" + key + "\n\n"
@@ -1786,6 +1806,13 @@ def header_id_from_text(self, text, prefix, n):
 
         return header_id
 
+    def _header_id_exists(self, text):
+        header_id = _slugify(text)
+        prefix = self.extras['header-ids'].get('prefix')
+        if prefix and isinstance(prefix, str):
+            header_id = prefix + '-' + header_id
+        return header_id in self._count_from_header_id
+
     def _toc_add_entry(self, level, id, name):
         if level > self._toc_depth:
             return
@@ -1810,6 +1837,7 @@ def _toc_add_entry(self, level, id, name):
     _h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)
 
     def _h_sub(self, match):
+        '''Handles processing markdown headers'''
         if match.group(1) is not None and match.group(3) == "-":
             return match.group(1)
         elif match.group(1) is not None:
@@ -1827,14 +1855,45 @@ def _h_sub(self, match):
         header_id_attr = ""
         if "header-ids" in self.extras:
             header_id = self.header_id_from_text(header_group,
-                self.extras["header-ids"], n)
+                self.extras["header-ids"].get('prefix'), n)
             if header_id:
                 header_id_attr = ' id="%s"' % header_id
         html = self._run_span_gamut(header_group)
         if "toc" in self.extras and header_id:
             self._toc_add_entry(n, header_id, html)
         return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)
 
+    _h_tag_re = re.compile(r'''
+        ^<h([1-6])(.*)>  # \1 tag num, \2 attrs
+        (.*)  # \3 text
+        </h\1>
+    ''', re.X | re.M)
+
+    def _h_tag_sub(self, match):
+        '''Different to `_h_sub` in that this function handles existing HTML headers'''
+        text = match.string[match.start(): match.end()]
+        h_level = int(match.group(1))
+        # extract id= attr from tag, trying to account for regex "misses"
+        id_attr = (re.match(r'.*?id=(\S+)?.*', match.group(2) or '') or '')
+        if id_attr:
+            # if id attr exists, extract that
+            id_attr = id_attr.group(1) or ''
+        id_attr = id_attr.strip('\'" ')
+        h_text = match.group(3)
+
+        # check if header was already processed (ie: was a markdown header rather than HTML)
+        if id_attr and self._header_id_exists(id_attr):
+            return text
+
+        # generate new header id if none existed
+        header_id = id_attr or self.header_id_from_text(h_text, self.extras['header-ids'].get('prefix'), h_level)
+        if "toc" in self.extras:
+            self._toc_add_entry(h_level, header_id, h_text)
+        if header_id and not id_attr:
+            # '<h[digit]' + new ID + '...'
+            return text[:3] + ' id="%s"' % header_id + text[3:]
+        return text
+
     def _do_headers(self, text):
         # Setext-style headers:
         #     Header 1

diff --git a/test/tm-cases/mixed_header_ids.html b/test/tm-cases/mixed_header_ids.html
@@ -0,0 +1,11 @@
+<h1 id="header-1">Header 1</h1>
+
+<h2 id="header-2">Header 2</h2>
+
+<h1 id="header-3">Header 3</h1>
+
+<h4 id="header-4" class="myclass">Header 4</h4>
+
+<h1 id="header-5">Header 5</h1>
+
+<h6 id="my-important-id">Header 6</h6>
diff --git a/test/tm-cases/mixed_header_ids.opts b/test/tm-cases/mixed_header_ids.opts
@@ -0,0 +1 @@
+{"extras": {"header-ids": {"mixed": True}, "toc": None}}
diff --git a/test/tm-cases/mixed_header_ids.text b/test/tm-cases/mixed_header_ids.text
@@ -0,0 +1,11 @@
+# Header 1
+
+<h2>Header 2</h2>
+
+# Header 3
+
+<h4 class="myclass">Header 4</h4>
+
+# Header 5
+
+<h6 id="my-important-id">Header 6</h6>
diff --git a/test/tm-cases/mixed_header_ids.toc_html b/test/tm-cases/mixed_header_ids.toc_html
@@ -0,0 +1,14 @@
+<ul>
+  <li><a href="#header-1">Header 1</a>
+  <ul>
+    <li><a href="#header-2">Header 2</a></li>
+  </ul></li>
+  <li><a href="#header-3">Header 3</a>
+  <ul>
+    <li><a href="#header-4">Header 4</a></li>
+  </ul></li>
+  <li><a href="#header-5">Header 5</a>
+  <ul>
+    <li><a href="#my-important-id">Header 6</a></li>
+  </ul></li>
+</ul>