expanded built-in markdown module with more standard features
mcfriend99 committed Jul 9, 2023
1 parent: 8cc6912 · commit: 7bff6e0
Showing 8 changed files with 258 additions and 6 deletions.
9 changes: 6 additions & 3 deletions libs/markdown/README.md
@@ -2,11 +2,14 @@

 This module is based on the [markdown-it](https://github.com/markdown-it/markdown-it) library by Vitaly Puzrin and Alex Kocharin.
 
-It has been extended with support for the following by default:
+It has been extended with support for the following by default in the standard (non-commonmark) mode:
 
 - anchors
-- emoji
-- ins
+- emojies
+- insert
+- subscripts
+- superscripts
+- mark
 
 
 ## Licenses
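For a quick sense of the extended syntax, a hedged sketch of the markdown each new rule accepts and the HTML it should produce; the `==`, `~`, and `^` markers are confirmed by the code below, while the `++` insert syntax and the exact HTML are assumed from the markdown-it plugins this module mirrors (the renderers are not shown in this diff):

```
==highlighted==   ->  <mark>highlighted</mark>
++inserted++      ->  <ins>inserted</ins>
H~2~O             ->  H<sub>2</sub>O
29^th^            ->  29<sup>th</sup>
```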
2 changes: 2 additions & 0 deletions libs/markdown/common/utils.b
@@ -39,6 +39,8 @@ def is_valid_entity_code(c) {
 var UNESCAPE_MD_RE = '\\\\([\\\\!"#$%&\'()*+,.\\/:;<=>?@[\\]^_`{|}~-])'
 var ENTITY_RE = '&([a-z#][a-z0-9]{1,31});'
 var UNESCAPE_ALL_RE = '/' + UNESCAPE_MD_RE + '|' + ENTITY_RE + '/si'
+var UNESCAPE_RE = '/\\\\([ \\\\!"#$%&\'()*+,.\/:;<=>?@[\]^_`{|}~-])/'
+var UNESCAPE_SPACE_RE = '/(^|[^\\\\])(\\\\\\\\)*\s/'
 
 var DIGITAL_ENTITY_TEST_RE = '/^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$/i'

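The two added regexes serve the subscript/superscript rules introduced below: `UNESCAPE_RE` strips the backslash from an escaped punctuation character or space, and `UNESCAPE_SPACE_RE` matches any whitespace preceded by an even number of backslashes, i.e. whitespace that is not itself escaped. A rough illustration, using only the `match`/`replace` calls that appear in this diff (return values assumed):

```blade
# hypothetical probe, not part of the commit
'2\\~3'.replace(UNESCAPE_RE, '$1')   # -> '2~3'  (escape removed)
'a\\ b'.match(UNESCAPE_SPACE_RE)     # no match: the space is escaped
'a b'.match(UNESCAPE_SPACE_RE)       # match: unescaped space
```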
5 changes: 4 additions & 1 deletion libs/markdown/inline/index.b
@@ -13,7 +13,10 @@ import .linkify
 import .newline
 import .strikethrough
 import .text
-import .ins
+import .insert
+import .subscript
+import .superscript
+import .mark
 
 # classes
 import .inline_state
File renamed without changes (from the import changes above, evidently libs/markdown/inline/ins.b → insert.b).
127 changes: 127 additions & 0 deletions libs/markdown/inline/mark.b
@@ -0,0 +1,127 @@
# Insert each marker as a separate text token, and add it to delimiter list
#
def tokenize(state, silent) {
  var i, scanned, token, len, ch,
      start = state.pos,
      marker = state.src[start]

  if silent return false

  if marker != '=' return false

  scanned = state.scan_delims(state.pos, true)
  len = scanned.length
  ch = marker

  if len < 2 return false

  if len % 2 > 0 {
    token = state.push('text', '', 0)
    token.content = ch
    len--
  }

  iter i = 0; i < len; i += 2 {
    token = state.push('text', '', 0)
    token.content = ch + ch

    if !scanned.can_open and !scanned.can_close continue

    state.delimiters.append({
      marker: marker,
      length: 0,   # disable "rule of 3" length checks meant for emphasis
      jump: i / 2, # 1 delimiter = 2 characters
      token: state.tokens.length() - 1,
      end: -1,
      open: scanned.can_open,
      close: scanned.can_close
    })
  }

  state.pos += scanned.length

  return true
}
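To make the loop above concrete, a hedged trace, assuming `scan_delims` reports the full run length plus `can_open`/`can_close` flags as in markdown-it:

```
src "a ==b== c", pos at the first '=':
  scanned.length = 2 -> one text token "==", one delimiter entry

src "===b==", pos 0:
  scanned.length = 3 -> odd, so one literal text token "=",
  then one text token "==" with a delimiter entry
```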


# Walk through delimiter list and replace text tokens with tags
#
def _post_process(state, delimiters) {
  var i, j,
      start_delim,
      end_delim,
      token,
      lone_markers = [],
      max = delimiters.length()

  iter i = 0; i < max; i++ {
    start_delim = delimiters[i]

    if start_delim.marker != '=' {
      continue
    }

    if start_delim.end == -1 {
      continue
    }

    end_delim = delimiters[start_delim.end]

    token = state.tokens[start_delim.token]
    token.type = 'mark_open'
    token.tag = 'mark'
    token.nesting = 1
    token.markup = '=='
    token.content = ''

    token = state.tokens[end_delim.token]
    token.type = 'mark_close'
    token.tag = 'mark'
    token.nesting = -1
    token.markup = '=='
    token.content = ''

    if (state.tokens[end_delim.token - 1].type == 'text' and
        state.tokens[end_delim.token - 1].content == '=') {
      lone_markers.append(end_delim.token - 1)
    }
  }

  # If a marker sequence has an odd number of characters, it's split
  # like this: `=====` -> `=` + `==` + `==`, leaving one marker at the
  # start of the sequence.
  #
  # So, we have to move all those markers after subsequent mark_close tags.
  #
  while lone_markers.length() > 0 {
    i = lone_markers.pop()
    j = i + 1

    while j < state.tokens.length() and state.tokens[j].type == 'mark_close' {
      j++
    }

    j--

    if i != j {
      token = state.tokens[j]
      state.tokens[j] = state.tokens[i]
      state.tokens[i] = token
    }
  }
}

def post_process(state) {
  var curr,
      tokens_meta = state.tokens_meta,
      max = (state.tokens_meta or []).length()

  _post_process(state, state.delimiters)

  iter curr = 0; curr < max; curr++ {
    if tokens_meta[curr] and tokens_meta[curr].delimiters {
      _post_process(state, tokens_meta[curr].delimiters)
    }
  }
}
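Assuming a renderer that mirrors markdown-it-mark (not shown in this diff), the net effect of `tokenize` plus `post_process` is:

```
input:   before ==important== after
tokens:  text("before ") mark_open text("important") mark_close text(" after")
html:    before <mark>important</mark> after
```

The lone-marker shuffle at the end only matters for odd runs such as `===`: the single leftover `=` text token is moved past any adjacent `mark_close` tokens so it renders outside the tag.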
57 changes: 57 additions & 0 deletions libs/markdown/inline/subscript.b
@@ -0,0 +1,57 @@
import ..common.utils { UNESCAPE_RE, UNESCAPE_SPACE_RE }


def subscript(state, silent) {
  var found,
      content,
      token,
      max = state.pos_max,
      start = state.pos

  if state.src[start] != '~' return false
  if silent return false # don't run any pairs in validation mode
  if start + 2 >= max return false

  state.pos = start + 1

  while state.pos < max {
    if state.src[state.pos] == '~' {
      found = true
      break
    }

    state.md.inline.skip_token(state)
  }

  if !found or start + 1 == state.pos {
    state.pos = start
    return false
  }

  content = state.src[start + 1, state.pos]

  # don't allow unescaped spaces/newlines inside
  if content.match(UNESCAPE_SPACE_RE) {
    state.pos = start
    return false
  }

  # found!
  state.pos_max = state.pos
  state.pos = start + 1

  # Earlier we checked !silent, but this implementation does not need it
  token = state.push('sub_open', 'sub', 1)
  token.markup = '~'

  token = state.push('text', '', 0)
  token.content = content.replace(UNESCAPE_RE, '$1')

  token = state.push('sub_close', 'sub', -1)
  token.markup = '~'

  state.pos = state.pos_max + 1
  state.pos_max = max
  return true
}
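A few hedged input/output examples for this rule, with the HTML assumed to mirror markdown-it-sub:

```
H~2~O     ->  H<sub>2</sub>O
~a b~     ->  ~a b~            (unescaped space: the rule bails out)
~a\ b~    ->  <sub>a b</sub>   (escaped space, unescaped via UNESCAPE_RE)
```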

56 changes: 56 additions & 0 deletions libs/markdown/inline/superscript.b
@@ -0,0 +1,56 @@
import ..common.utils { UNESCAPE_RE, UNESCAPE_SPACE_RE }

def superscript(state, silent) {
  var found,
      content,
      token,
      max = state.pos_max,
      start = state.pos

  if state.src[start] != '^' return false
  if silent return false # don't run any pairs in validation mode
  if start + 2 >= max return false

  state.pos = start + 1

  while state.pos < max {
    if state.src[state.pos] == '^' {
      found = true
      break
    }

    state.md.inline.skip_token(state)
  }

  if !found or start + 1 == state.pos {
    state.pos = start
    return false
  }

  content = state.src[start + 1, state.pos]

  # don't allow unescaped spaces/newlines inside
  if content.match(UNESCAPE_SPACE_RE) {
    state.pos = start
    return false
  }

  # found!
  state.pos_max = state.pos
  state.pos = start + 1

  # Earlier we checked !silent, but this implementation does not need it
  token = state.push('sup_open', 'sup', 1)
  token.markup = '^'

  token = state.push('text', '', 0)
  token.content = content.replace(UNESCAPE_RE, '$1')

  token = state.push('sup_close', 'sup', -1)
  token.markup = '^'

  state.pos = state.pos_max + 1
  state.pos_max = max
  return true
}
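This is the same algorithm as `subscript` with `^`/`sup` in place of `~`/`sub`; the two functions differ only in marker and tag, matching the separate upstream markdown-it-sub/markdown-it-sup plugins. Expected behavior (output again assumed):

```
x^2^ + y^2^   ->  x<sup>2</sup> + y<sup>2</sup>
x^a b^        ->  x^a b^          (unescaped space: the rule bails out)
```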

8 changes: 6 additions & 2 deletions libs/markdown/parser_inline.b
@@ -10,8 +10,11 @@ var _rules = [
   [ 'escape', inline.escape ],
   [ 'backticks', inline.backticks ],
   [ 'strikethrough', inline.strikethrough.tokenize ],
-  [ 'ins', inline.ins.tokenize ],
+  [ 'insert', inline.insert.tokenize ],
+  [ 'mark', inline.mark.tokenize ],
   [ 'emphasis', inline.emphasis.tokenize ],
+  [ 'subscript', inline.subscript ],
+  [ 'superscript', inline.superscript ],
   [ 'link', inline.link ],
   [ 'image', inline.image ],
   [ 'autolink', inline.autolink ],
@@ -26,7 +29,8 @@
 var _rules2 = [
   [ 'balance_pairs', inline.balance_pairs ],
   [ 'strikethrough', inline.strikethrough.post_process ],
-  [ 'ins', inline.ins.post_process ],
+  [ 'insert', inline.insert.post_process ],
+  [ 'mark', inline.mark.post_process ],
   [ 'emphasis', inline.emphasis.post_process ],
   # rules for pairs separate '**' into its own text tokens, which may be left unused,
   # rule below merges unused segments back with the rest of the text
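Putting it together, a minimal end-to-end sketch of exercising the new rules; the `markdown` import and `render()` call are assumptions about the module's public API, which this diff does not show:

```blade
import markdown

# assumed API: construct a parser in the default (non-commonmark) mode
var md = markdown()

echo md.render('==mark==, H~2~O and 29^th^')
# expected, if the renderers mirror the markdown-it plugins:
# <p><mark>mark</mark>, H<sub>2</sub>O and 29<sup>th</sup></p>
```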
