Skip to content

Commit

Permalink
Merge pull request #538 from Crozzers/mixed-header-ids
Browse files Browse the repository at this point in the history
Include HTML headers in TOC
  • Loading branch information
nicholasserra committed Dec 1, 2023
2 parents 182b7f5 + b93c41e commit efd824f
Show file tree
Hide file tree
Showing 6 changed files with 98 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGES.md
Expand Up @@ -8,6 +8,7 @@
- [pull #532] Fix #493 persisting when `code-friendly` extra enabled
- [pull #535] Update `_slugify` to use utf-8 encoding (issue #534)
- [pull #536] Maintain order of appearance in footnotes
- [pull #538] Include HTML headers in TOC

## python-markdown2 2.4.10

Expand Down
61 changes: 60 additions & 1 deletion lib/markdown2.py
Expand Up @@ -240,6 +240,13 @@ def __init__(self, html4tags=False, tab_width=4, safe_mode=None,
else:
self._toc_depth = self.extras["toc"].get("depth", 6)

if 'header-ids' in self.extras:
if not isinstance(self.extras['header-ids'], dict):
self.extras['header-ids'] = {
'mixed': False,
'prefix': self.extras['header-ids']
}

if 'break-on-newline' in self.extras:
self.extras.setdefault('breaks', {})
self.extras['breaks']['on_newline'] = True
Expand Down Expand Up @@ -424,6 +431,17 @@ def convert(self, text):
text = self._a_nofollow_or_blank_links.sub(r'<\1 rel="nofollow"\2', text)

if "toc" in self.extras and self._toc:
if self.extras['header-ids'].get('mixed'):
# TOC will only be out of order if mixed headers is enabled
def toc_sort(entry):
'''Sort the TOC by order of appearance in text'''
return re.search(
# header tag, any attrs, the ID, any attrs, the text, close tag
r'^<(h%d).*?id=(["\'])%s\2.*>%s</\1>$' % (entry[0], entry[1], re.escape(entry[2])),
text, re.M
).start()

self._toc.sort(key=toc_sort)
self._toc_html = calculate_toc_html(self._toc)

# Prepend toc html to output
Expand Down Expand Up @@ -783,6 +801,8 @@ def _hash_html_block_sub(self, match, raw=False):
return ''.join(["\n\n", f_key,
"\n\n", middle, "\n\n",
l_key, "\n\n"])
elif self.extras.get('header-ids', {}).get('mixed') and self._h_tag_re.match(html):
html = self._h_tag_re.sub(self._h_tag_sub, html)
key = _hash_text(html)
self.html_blocks[key] = html
return "\n\n" + key + "\n\n"
Expand Down Expand Up @@ -1786,6 +1806,13 @@ def header_id_from_text(self, text, prefix, n):

return header_id

def _header_id_exists(self, text):
header_id = _slugify(text)
prefix = self.extras['header-ids'].get('prefix')
if prefix and isinstance(prefix, str):
header_id = prefix + '-' + header_id
return header_id in self._count_from_header_id

def _toc_add_entry(self, level, id, name):
if level > self._toc_depth:
return
Expand All @@ -1810,6 +1837,7 @@ def _toc_add_entry(self, level, id, name):
_h_re_tag_friendly = re.compile(_h_re_base % '+', re.X | re.M)

def _h_sub(self, match):
'''Handles processing markdown headers'''
if match.group(1) is not None and match.group(3) == "-":
return match.group(1)
elif match.group(1) is not None:
Expand All @@ -1827,14 +1855,45 @@ def _h_sub(self, match):
header_id_attr = ""
if "header-ids" in self.extras:
header_id = self.header_id_from_text(header_group,
self.extras["header-ids"], n)
self.extras["header-ids"].get('prefix'), n)
if header_id:
header_id_attr = ' id="%s"' % header_id
html = self._run_span_gamut(header_group)
if "toc" in self.extras and header_id:
self._toc_add_entry(n, header_id, html)
return "<h%d%s>%s</h%d>\n\n" % (n, header_id_attr, html, n)

_h_tag_re = re.compile(r'''
^<h([1-6])(.*)> # \1 tag num, \2 attrs
(.*) # \3 text
</h\1>
''', re.X | re.M)

def _h_tag_sub(self, match):
'''Different to `_h_sub` in that this function handles existing HTML headers'''
text = match.string[match.start(): match.end()]
h_level = int(match.group(1))
# extract id= attr from tag, trying to account for regex "misses"
id_attr = (re.match(r'.*?id=(\S+)?.*', match.group(2) or '') or '')
if id_attr:
# if id attr exists, extract that
id_attr = id_attr.group(1) or ''
id_attr = id_attr.strip('\'" ')
h_text = match.group(3)

# check if header was already processed (ie: was a markdown header rather than HTML)
if id_attr and self._header_id_exists(id_attr):
return text

# generate new header id if none existed
header_id = id_attr or self.header_id_from_text(h_text, self.extras['header-ids'].get('prefix'), h_level)
if "toc" in self.extras:
self._toc_add_entry(h_level, header_id, h_text)
if header_id and not id_attr:
# '<h[digit]' + new ID + '...'
return text[:3] + ' id="%s"' % header_id + text[3:]
return text

def _do_headers(self, text):
# Setext-style headers:
# Header 1
Expand Down
11 changes: 11 additions & 0 deletions test/tm-cases/mixed_header_ids.html
@@ -0,0 +1,11 @@
<h1 id="header-1">Header 1</h1>

<h2 id="header-2">Header 2</h2>

<h1 id="header-3">Header 3</h1>

<h4 id="header-4" class="myclass">Header 4</h4>

<h1 id="header-5">Header 5</h1>

<h6 id="my-important-id">Header 6</h6>
1 change: 1 addition & 0 deletions test/tm-cases/mixed_header_ids.opts
@@ -0,0 +1 @@
{"extras": {"header-ids": {"mixed": True}, "toc": None}}
11 changes: 11 additions & 0 deletions test/tm-cases/mixed_header_ids.text
@@ -0,0 +1,11 @@
# Header 1

<h2>Header 2</h2>

# Header 3

<h4 class="myclass">Header 4</h4>

# Header 5

<h6 id="my-important-id">Header 6</h6>
14 changes: 14 additions & 0 deletions test/tm-cases/mixed_header_ids.toc_html
@@ -0,0 +1,14 @@
<ul>
<li><a href="#header-1">Header 1</a>
<ul>
<li><a href="#header-2">Header 2</a></li>
</ul></li>
<li><a href="#header-3">Header 3</a>
<ul>
<li><a href="#header-4">Header 4</a></li>
</ul></li>
<li><a href="#header-5">Header 5</a>
<ul>
<li><a href="#my-important-id">Header 6</a></li>
</ul></li>
</ul>

0 comments on commit efd824f

Please sign in to comment.